!--------------------------------------------------------------------------------------------------!
!   CP2K: A general program to perform molecular dynamics simulations                              !
!   Copyright 2000-2024 CP2K developers group <https://cp2k.org>                                   !
!                                                                                                  !
!   SPDX-License-Identifier: GPL-2.0-or-later                                                      !
!--------------------------------------------------------------------------------------------------!

! **************************************************************************************************
!> \brief Routines to calculate derivatives with respect to basis function origin
!> \par History
!>      04.2008 created [Manuel Guidon]
!> \author Manuel Guidon
! **************************************************************************************************
MODULE hfx_derivatives
   USE admm_types, ONLY: admm_type
   USE atomic_kind_types, ONLY: atomic_kind_type, &
                                get_atomic_kind_set
   USE cell_types, ONLY: cell_type, &
                         pbc
   USE cp_control_types, ONLY: dft_control_type
   USE cp_files, ONLY: close_file, &
                       open_file
   USE cp_log_handling, ONLY: cp_get_default_logger, &
                              cp_logger_type
   USE cp_output_handling, ONLY: cp_p_file, &
                                 cp_print_key_finished_output, &
                                 cp_print_key_should_output, &
                                 cp_print_key_unit_nr
   USE message_passing, ONLY: mp_para_env_type
   USE cp_dbcsr_api, ONLY: dbcsr_get_matrix_type, &
                           dbcsr_p_type, &
                           dbcsr_type_antisymmetric
   USE gamma, ONLY: init_md_ftable
   USE hfx_communication, ONLY: get_full_density
   USE hfx_compression_methods, ONLY: hfx_add_mult_cache_elements, &
                                      hfx_add_single_cache_element, &
                                      hfx_decompress_first_cache, &
                                      hfx_flush_last_cache, &
                                      hfx_get_mult_cache_elements, &
                                      hfx_get_single_cache_element, &
                                      hfx_reset_cache_and_container
   USE hfx_libint_interface, ONLY: evaluate_deriv_eri
   USE hfx_load_balance_methods, ONLY: collect_load_balance_info, &
                                       hfx_load_balance, &
                                       hfx_update_load_balance
   USE hfx_pair_list_methods, ONLY: build_atomic_pair_list, &
                                    build_pair_list, &
                                    build_pair_list_pgf, &
                                    build_pgf_product_list, &
                                    pgf_product_list_size
   USE hfx_screening_methods, ONLY: update_pmax_mat
   USE hfx_types, ONLY: &
      alloc_containers, dealloc_containers, hfx_basis_info_type, hfx_basis_type, hfx_cache_type, &
      hfx_cell_type, hfx_container_type, hfx_distribution, hfx_general_type, hfx_init_container, &
      hfx_load_balance_type, hfx_memory_type, hfx_p_kind, hfx_pgf_list, hfx_pgf_product_list, &
      hfx_potential_type, hfx_screen_coeff_type, hfx_screening_type, hfx_task_list_type, &
      hfx_type, init_t_c_g0_lmax, log_zero, pair_list_type, pair_set_list_type
   USE input_constants, ONLY: do_potential_mix_cl_trunc, &
                              do_potential_truncated, &
                              hfx_do_eval_forces
   USE input_section_types, ONLY: section_vals_type
   USE kinds, ONLY: dp, &
                    int_8
   USE libint_wrapper, ONLY: cp_libint_t
   USE machine, ONLY: m_walltime
   USE mathconstants, ONLY: fac
   USE orbital_pointers, ONLY: nco, &
                               ncoset, &
                               nso
   USE particle_types, ONLY: particle_type
   USE qs_environment_types, ONLY: get_qs_env, &
                                   qs_environment_type
   USE qs_force_types, ONLY: qs_force_type
   USE t_c_g0, ONLY: init
   USE util, ONLY: sort
   USE virial_types, ONLY: virial_type

!$ USE OMP_LIB, ONLY: omp_get_max_threads, omp_get_thread_num, omp_get_num_threads

#include "./base/base_uses.f90"

   IMPLICIT NONE
   PRIVATE

   PUBLIC :: derivatives_four_center

   CHARACTER(len=*), PARAMETER, PRIVATE :: moduleN = 'hfx_derivatives'

!***

CONTAINS

! **************************************************************************************************
!> \brief computes four center derivatives for a full basis set and updates the
!>      forces%fock_4c arrays. Uses all 8 eri symmetries
!> \param qs_env ...
!> \param rho_ao density matrix
!> \param rho_ao_resp relaxed density matrix from response
!> \param hfx_section HFX input section
!> \param para_env para_env
!> \param irep ID of HFX replica
!> \param use_virial ...
!> \param adiabatic_rescale_factor parameter used for MCY3 hybrid
!> \param resp_only Calculate only forces from response density
!> \par History
!>      06.2007 created [Manuel Guidon]
!>      08.2007 optimized load balance [Manuel Guidon]
!>      02.2009 completely rewritten screening part [Manuel Guidon]
!>      12.2017 major bug fix. removed wrong cycle that was causing segfault.
!>              see https://groups.google.com/forum/#!topic/cp2k/pc6B14XOALY
!>              [Tobias Binninger + Valery Weber]
!> \author Manuel Guidon
! **************************************************************************************************
   SUBROUTINE derivatives_four_center(qs_env, rho_ao, rho_ao_resp, hfx_section, para_env, &
                                      irep, use_virial, adiabatic_rescale_factor, resp_only, &
                                      external_x_data)

      TYPE(qs_environment_type), POINTER                 :: qs_env
      TYPE(dbcsr_p_type), DIMENSION(:, :), POINTER       :: rho_ao
      TYPE(dbcsr_p_type), DIMENSION(:), POINTER          :: rho_ao_resp
      TYPE(section_vals_type), POINTER                   :: hfx_section
      TYPE(mp_para_env_type), POINTER                    :: para_env
      INTEGER, INTENT(IN)                                :: irep
      LOGICAL, INTENT(IN)                                :: use_virial
      REAL(dp), INTENT(IN), OPTIONAL                     :: adiabatic_rescale_factor
      LOGICAL, INTENT(IN), OPTIONAL                      :: resp_only
      TYPE(hfx_type), DIMENSION(:, :), POINTER, OPTIONAL  :: external_x_data

      CHARACTER(LEN=*), PARAMETER :: routineN = 'derivatives_four_center'

      INTEGER :: atomic_offset_ac, atomic_offset_ad, atomic_offset_bc, atomic_offset_bd, bin, &
                 bits_max_val, buffer_left, buffer_size, buffer_start, cache_size, coord, current_counter, &
                 forces_map(4, 2), handle, handle_bin, handle_getP, handle_load, handle_main, i, i_atom, &
                 i_list_ij, i_list_kl, i_set_list_ij, i_set_list_ij_start, i_set_list_ij_stop, &
                 i_set_list_kl, i_set_list_kl_start, i_set_list_kl_stop, i_thread, iatom, iatom_block, &
                 iatom_end, iatom_start, ikind, iset, iw, j, j_atom, jatom, jatom_block, jatom_end, &
                 jatom_start, jkind, jset, k, k_atom, katom, katom_block, katom_end, katom_start
      INTEGER :: kind_kind_idx, kkind, kset, l_atom, l_max, latom, latom_block, latom_end, &
                 latom_start, lkind, lset, max_am, max_pgf, max_set, my_bin_id, my_bin_size, my_thread_id, &
                 n_threads, natom, nbits, ncob, ncos_max, nints, nkimages, nkind, nneighbors, nseta, &
                 nsetb, nsgf_max, nspins, sgfb, shm_task_counter, shm_total_bins, sphi_a_u1, sphi_a_u2, &
                 sphi_a_u3, sphi_b_u1, sphi_b_u2, sphi_b_u3, sphi_c_u1, sphi_c_u2, sphi_c_u3, sphi_d_u1, &
                 sphi_d_u2, sphi_d_u3, swap_id, tmp_i4, unit_id
      INTEGER(int_8) :: atom_block, counter, estimate_to_store_int, max_val_memory, &
                        mem_compression_counter, mem_eris, mem_max_val, my_current_counter, my_istart, &
                        n_processes, nblocks, ncpu, neris_incore, neris_onthefly, neris_tmp, neris_total, &
                        nprim_ints, shm_neris_incore, shm_neris_onthefly, shm_neris_total, shm_nprim_ints, &
                        shm_stor_count_max_val, shm_storage_counter_integrals, stor_count_max_val, &
                        storage_counter_integrals, tmp_block, tmp_i8(6)
      INTEGER(int_8), ALLOCATABLE, DIMENSION(:)          :: tmp_task_list_cost
      INTEGER, ALLOCATABLE, DIMENSION(:)                 :: atom_of_kind, kind_of, last_sgf_global, &
                                                            nimages, tmp_index
      INTEGER, DIMENSION(:), POINTER :: la_max, la_min, lb_max, lb_min, lc_max, lc_min, ld_max, &
                                        ld_min, npgfa, npgfb, npgfc, npgfd, nsgfa, nsgfb, nsgfc, nsgfd, shm_block_offset
      INTEGER, DIMENSION(:, :), POINTER :: first_sgfb, nsgfl_a, nsgfl_b, nsgfl_c, nsgfl_d, &
                                           offset_ac_set, offset_ad_set, offset_bc_set, offset_bd_set, shm_atomic_block_offset
      INTEGER, DIMENSION(:, :), POINTER, SAVE            :: shm_is_assoc_atomic_block
      INTEGER, DIMENSION(:, :, :, :), POINTER            :: shm_set_offset
      INTEGER, SAVE                                      :: shm_number_of_p_entries
      LOGICAL :: bins_left, buffer_overflow, do_dynamic_load_balancing, do_it, do_periodic, &
                 do_print_load_balance_info, is_anti_symmetric, my_resp_only, screen_pmat_forces, &
                 treat_forces_in_core, use_disk_storage, with_resp_density
      LOGICAL, DIMENSION(:, :), POINTER                  :: shm_atomic_pair_list
      REAL(dp) :: bintime_start, bintime_stop, cartesian_estimate, cartesian_estimate_virial, &
                  compression_factor, eps_schwarz, eps_storage, fac, hf_fraction, ln_10, log10_eps_schwarz, &
                  log10_pmax, log_2, max_contraction_val, max_val1, max_val2, max_val2_set, &
                  my_adiabatic_rescale_factor, pmax_atom, pmax_blocks, pmax_entry, pmax_tmp, ra(3), rab2, &
                  rb(3), rc(3), rcd2, rd(3), spherical_estimate, spherical_estimate_virial, symm_fac, &
                  tmp_virial(3, 3)
      REAL(dp), ALLOCATABLE, DIMENSION(:) :: ede_buffer1, ede_buffer2, ede_primitives_tmp, &
                                             ede_primitives_tmp_virial, ede_work, ede_work2, ede_work2_virial, ede_work_forces, &
                                             ede_work_virial, pac_buf, pac_buf_beta, pac_buf_resp, pac_buf_resp_beta, pad_buf, &
                                             pad_buf_beta, pad_buf_resp, pad_buf_resp_beta, pbc_buf, pbc_buf_beta, pbc_buf_resp, &
                                             pbc_buf_resp_beta, pbd_buf, pbd_buf_beta, pbd_buf_resp, pbd_buf_resp_beta
      REAL(dp), ALLOCATABLE, DIMENSION(:), TARGET        :: primitive_forces, primitive_forces_virial
      REAL(dp), DIMENSION(:), POINTER                    :: full_density_resp, &
                                                            full_density_resp_beta, T2
      REAL(dp), DIMENSION(:, :), POINTER :: full_density_alpha, full_density_beta, &
                                            max_contraction, ptr_p_1, ptr_p_2, ptr_p_3, ptr_p_4, shm_pmax_atom, shm_pmax_block, &
                                            sphi_b, zeta, zetb, zetc, zetd
      REAL(dp), DIMENSION(:, :, :), POINTER              :: sphi_a_ext_set, sphi_b_ext_set, &
                                                            sphi_c_ext_set, sphi_d_ext_set
      REAL(dp), DIMENSION(:, :, :, :), POINTER           :: sphi_a_ext, sphi_b_ext, sphi_c_ext, &
                                                            sphi_d_ext
      REAL(KIND=dp)                                      :: coeffs_kind_max0
      TYPE(admm_type), POINTER                           :: admm_env
      TYPE(atomic_kind_type), DIMENSION(:), POINTER      :: atomic_kind_set
      TYPE(cell_type), POINTER                           :: cell
      TYPE(cp_libint_t)                                  :: private_deriv
      TYPE(cp_logger_type), POINTER                      :: logger
      TYPE(dft_control_type), POINTER                    :: dft_control
      TYPE(hfx_basis_info_type), POINTER                 :: basis_info
      TYPE(hfx_basis_type), DIMENSION(:), POINTER        :: basis_parameter
      TYPE(hfx_cache_type), DIMENSION(:), POINTER        :: integral_caches
      TYPE(hfx_cache_type), POINTER                      :: maxval_cache
      TYPE(hfx_container_type), DIMENSION(:), POINTER    :: integral_containers
      TYPE(hfx_container_type), POINTER                  :: maxval_container
      TYPE(hfx_distribution), POINTER                    :: distribution_forces
      TYPE(hfx_general_type)                             :: general_parameter
      TYPE(hfx_load_balance_type), POINTER               :: load_balance_parameter
      TYPE(hfx_memory_type), POINTER                     :: memory_parameter
      TYPE(hfx_p_kind), DIMENSION(:), POINTER            :: shm_initial_p
      TYPE(hfx_pgf_list), ALLOCATABLE, DIMENSION(:)      :: pgf_list_ij, pgf_list_kl
      TYPE(hfx_pgf_product_list), ALLOCATABLE, &
         DIMENSION(:)                                    :: pgf_product_list
      TYPE(hfx_potential_type)                           :: potential_parameter
      TYPE(hfx_screen_coeff_type), DIMENSION(:, :), &
         POINTER                                         :: screen_coeffs_kind, tmp_R_1, tmp_R_2, &
                                                            tmp_screen_pgf1, tmp_screen_pgf2
      TYPE(hfx_screen_coeff_type), &
         DIMENSION(:, :, :, :), POINTER                  :: screen_coeffs_set
      TYPE(hfx_screen_coeff_type), &
         DIMENSION(:, :, :, :, :, :), POINTER            :: radii_pgf, screen_coeffs_pgf
      TYPE(hfx_screening_type)                           :: screening_parameter
      TYPE(hfx_task_list_type), DIMENSION(:), POINTER    :: shm_task_list, tmp_task_list
      TYPE(hfx_type), POINTER                            :: actual_x_data
      TYPE(hfx_type), DIMENSION(:, :), POINTER            :: my_x_data
      TYPE(pair_list_type)                               :: list_ij, list_kl
      TYPE(pair_set_list_type), ALLOCATABLE, &
         DIMENSION(:)                                    :: set_list_ij, set_list_kl
      TYPE(particle_type), DIMENSION(:), POINTER         :: particle_set
      TYPE(qs_force_type), DIMENSION(:), POINTER         :: force
      TYPE(virial_type), POINTER                         :: virial

      NULLIFY (dft_control, admm_env)

      with_resp_density = .FALSE.
      IF (ASSOCIATED(rho_ao_resp)) with_resp_density = .TRUE.

      my_resp_only = .FALSE.
      IF (PRESENT(resp_only)) my_resp_only = resp_only

      is_anti_symmetric = dbcsr_get_matrix_type(rho_ao(1, 1)%matrix) .EQ. dbcsr_type_antisymmetric

      CALL get_qs_env(qs_env, &
                      admm_env=admm_env, &
                      particle_set=particle_set, &
                      atomic_kind_set=atomic_kind_set, &
                      cell=cell, &
                      dft_control=dft_control)

      nspins = dft_control%nspins
      nkimages = dft_control%nimages
      CPASSERT(nkimages == 1)

      my_x_data => qs_env%x_data
      IF (PRESENT(external_x_data)) my_x_data => external_x_data

      !! One atom systems have no contribution to forces
      IF (SIZE(particle_set, 1) == 1) THEN
         IF (.NOT. use_virial) RETURN
      END IF

      CALL timeset(routineN, handle)
      !! Calculate l_max used in fgamma , because init_md_ftable is definitely not thread safe
      nkind = SIZE(atomic_kind_set, 1)
      l_max = 0
      DO ikind = 1, nkind
         l_max = MAX(l_max, MAXVAL(my_x_data(1, 1)%basis_parameter(ikind)%lmax))
      END DO
      l_max = 4*l_max + 1
      CALL init_md_ftable(l_max)

      IF (my_x_data(1, 1)%potential_parameter%potential_type == do_potential_truncated .OR. &
          my_x_data(1, 1)%potential_parameter%potential_type == do_potential_mix_cl_trunc) THEN
         IF (l_max > init_t_c_g0_lmax) THEN
            CALL open_file(unit_number=unit_id, file_name=my_x_data(1, 1)%potential_parameter%filename)
            CALL init(l_max, unit_id, para_env%mepos, para_env)
            CALL close_file(unit_id)
            init_t_c_g0_lmax = l_max
         END IF
      END IF

      n_threads = 1
!$    n_threads = omp_get_max_threads()

      shm_neris_total = 0
      shm_nprim_ints = 0
      shm_neris_onthefly = 0
      shm_storage_counter_integrals = 0
      shm_neris_incore = 0
      shm_stor_count_max_val = 0

      !! get force array
      CALL get_qs_env(qs_env=qs_env, force=force, virial=virial)

      my_adiabatic_rescale_factor = 1.0_dp
      IF (PRESENT(adiabatic_rescale_factor)) THEN
         my_adiabatic_rescale_factor = adiabatic_rescale_factor
      END IF

      !$OMP PARALLEL DEFAULT(OMP_DEFAULT_NONE_WITH_OOP) &
!$OMP SHARED(qs_env,&
!$OMP                                  rho_ao,&
!$OMP                                  rho_ao_resp,&
!$OMP                                  with_resp_density,&
!$OMP                                  hfx_section,&
!$OMP                                  para_env,&
!$OMP                                  irep,&
!$OMP                                  my_resp_only,&
!$OMP                                  ncoset,&
!$OMP                                  nco,&
!$OMP                                  nso,&
!$OMP                                  n_threads,&
!$OMP                                  full_density_alpha,&
!$OMP                                  full_density_resp,&
!$OMP                                  full_density_resp_beta,&
!$OMP                                  full_density_beta,&
!$OMP                                  shm_initial_p,&
!$OMP                                  shm_is_assoc_atomic_block,&
!$OMP                                  shm_number_of_p_entries,&
!$OMP                                  force,&
!$OMP                                  virial,&
!$OMP                                  my_adiabatic_rescale_factor,&
!$OMP                                  shm_neris_total,&
!$OMP                                  shm_neris_onthefly,&
!$OMP                                  shm_storage_counter_integrals,&
!$OMP                                  shm_neris_incore,&
!$OMP                                  shm_nprim_ints,&
!$OMP                                  shm_stor_count_max_val,&
!$OMP                                  cell,&
!$OMP                                  screen_coeffs_set,&
!$OMP                                  screen_coeffs_kind,&
!$OMP                                  screen_coeffs_pgf,&
!$OMP                                  pgf_product_list_size,&
!$OMP                                  nkind,&
!$OMP                                  is_anti_symmetric,&
!$OMP                                  radii_pgf,&
!$OMP                                  shm_atomic_block_offset,&
!$OMP                                  shm_set_offset,&
!$OMP                                  shm_block_offset,&
!$OMP                                  shm_task_counter,&
!$OMP                                  shm_task_list,&
!$OMP                                  shm_total_bins,&
!$OMP                                  shm_pmax_block,&
!$OMP                                  shm_atomic_pair_list,&
!$OMP                                  shm_pmax_atom,&
!$OMP                                  use_virial,&
!$OMP                                  do_print_load_balance_info,&
!$OMP                                  nkimages,nspins,my_x_data)&
!$OMP  PRIVATE(actual_x_data,atomic_kind_set,atomic_offset_ac,atomic_offset_ad,atomic_offset_bc,&
!$OMP  atomic_offset_bd,atom_of_kind,basis_info,&
!$OMP  basis_parameter,bin,bins_left,bintime_start,bintime_stop,bits_max_val,buffer_left,buffer_overflow,buffer_size,&
!$OMP  buffer_start,cache_size,cartesian_estimate,cartesian_estimate_virial,&
!$OMP  coeffs_kind_max0,compression_factor,counter,current_counter,&
!$OMP  distribution_forces,do_dynamic_load_balancing,do_it,do_periodic,ede_buffer1,ede_buffer2,&
!$OMP  ede_primitives_tmp,ede_primitives_tmp_virial,&
!$OMP  ede_work,ede_work2,ede_work2_virial,ede_work_forces,ede_work_virial,eps_schwarz,eps_storage,estimate_to_store_int,&
!$OMP  fac,first_sgfb,forces_map,general_parameter,handle_bin,handle_getp,handle_load,&
!$OMP  handle_main,hf_fraction,i_atom,iatom_end,iatom_start,ikind,integral_caches,integral_containers,&
!$OMP  i_set_list_ij_start,i_set_list_ij_stop,i_set_list_kl_start,i_set_list_kl_stop,i_thread,iw,j_atom,jatom_end,&
!$OMP  jatom_start,jkind,katom,k_atom,katom_block,katom_end,katom_start,kind_kind_idx,&
!$OMP  kind_of,kkind,kset,la_max,la_min,last_sgf_global,latom,l_atom,&
!$OMP  latom_block,latom_end,latom_start,lb_max,lb_min,lc_max,lc_min,ld_max,&
!$OMP  ld_min,list_ij,list_kl,lkind,ln_10,load_balance_parameter,log10_eps_schwarz,log10_pmax,&
!$OMP  log_2,logger,lset,max_am,max_contraction,max_contraction_val,max_pgf,max_set,&
!$OMP  max_val1,max_val2,max_val2_set,maxval_cache,maxval_container,max_val_memory,mem_compression_counter,mem_eris,&
!$OMP  mem_max_val,memory_parameter,my_bin_id,my_bin_size,my_current_counter,my_istart,my_thread_id,natom,&
!$OMP  nbits,nblocks,ncob,ncos_max,ncpu,neris_incore,neris_onthefly,neris_tmp,&
!$OMP  neris_total,nimages,nints,nneighbors,npgfa,npgfb,npgfc,npgfd,&
!$OMP  nprim_ints,n_processes,nseta,nsetb,nsgfa,nsgfb,nsgfc,nsgfd,&
!$OMP  nsgfl_a,nsgfl_b,nsgfl_c,nsgfl_d,nsgf_max,offset_ac_set,offset_ad_set,&
!$OMP  offset_bc_set,offset_bd_set,pac_buf,pac_buf_beta,pac_buf_resp,pac_buf_resp_beta,pad_buf,pad_buf_beta,pad_buf_resp,&
!$OMP  pad_buf_resp_beta,particle_set,pbc_buf,pbc_buf_beta,pbc_buf_resp,pbc_buf_resp_beta,pbd_buf,pbd_buf_beta, &
!$OMP  pbd_buf_resp, pbd_buf_resp_beta, pgf_list_ij,&
!$OMP  pgf_list_kl,pgf_product_list,pmax_atom,pmax_blocks,pmax_entry,pmax_tmp,potential_parameter,primitive_forces,&
!$OMP  primitive_forces_virial,private_deriv,ptr_p_1,ptr_p_2,ptr_p_3,ptr_p_4,ra,rab2,&
!$OMP  rb,rc,rcd2,rd,screening_parameter,screen_pmat_forces,set_list_ij,set_list_kl,&
!$OMP  sgfb,spherical_estimate,spherical_estimate_virial,sphi_a_ext,sphi_a_ext_set,sphi_a_u1,sphi_a_u2,sphi_a_u3,&
!$OMP  sphi_b,sphi_b_ext,sphi_b_ext_set,sphi_b_u1,sphi_b_u2,sphi_b_u3,sphi_c_ext,sphi_c_ext_set,&
!$OMP  sphi_c_u1,sphi_c_u2,sphi_c_u3,sphi_d_ext,sphi_d_ext_set,sphi_d_u1,sphi_d_u2,sphi_d_u3,&
!$OMP  storage_counter_integrals,stor_count_max_val,swap_id,symm_fac,t2,tmp_block,tmp_i8, tmp_i4,&
!$OMP  tmp_index,tmp_r_1,tmp_r_2,tmp_screen_pgf1,tmp_screen_pgf2,tmp_task_list,tmp_task_list_cost,tmp_virial,&
!$OMP  treat_forces_in_core,use_disk_storage,zeta,zetb,zetc,zetd)

      i_thread = 0
!$    i_thread = omp_get_thread_num()

      ln_10 = LOG(10.0_dp)
      log_2 = LOG10(2.0_dp)
      actual_x_data => my_x_data(irep, i_thread + 1)
      do_periodic = actual_x_data%periodic_parameter%do_periodic

      screening_parameter = actual_x_data%screening_parameter
      general_parameter = actual_x_data%general_parameter
      potential_parameter = actual_x_data%potential_parameter
      basis_info => actual_x_data%basis_info

      load_balance_parameter => actual_x_data%load_balance_parameter
      basis_parameter => actual_x_data%basis_parameter

      ! IF(with_resp_density) THEN
      !   ! here we also do a copy of the original load_balance_parameter
      !   ! since if MP2 forces are required after the calculation of the HFX
      !   ! derivatives we need to calculate again the SCF energy.
      !   ALLOCATE(load_balance_parameter_energy)
      !   load_balance_parameter_energy = actual_x_data%load_balance_parameter
      ! END IF

      memory_parameter => actual_x_data%memory_parameter

      cache_size = memory_parameter%cache_size
      bits_max_val = memory_parameter%bits_max_val

      use_disk_storage = .FALSE.

      CALL get_qs_env(qs_env=qs_env, &
                      atomic_kind_set=atomic_kind_set, &
                      particle_set=particle_set)

      max_set = basis_info%max_set
      max_am = basis_info%max_am
      natom = SIZE(particle_set, 1)

      treat_forces_in_core = memory_parameter%treat_forces_in_core

      hf_fraction = general_parameter%fraction
      hf_fraction = hf_fraction*my_adiabatic_rescale_factor
      eps_schwarz = screening_parameter%eps_schwarz_forces
      IF (eps_schwarz < 0.0_dp) THEN
         log10_eps_schwarz = log_zero
      ELSE
         !! ** make eps_schwarz tighter in case of a stress calculation
         IF (use_virial) eps_schwarz = eps_schwarz/actual_x_data%periodic_parameter%R_max_stress
         log10_eps_schwarz = LOG10(eps_schwarz)
      END IF
      screen_pmat_forces = screening_parameter%do_p_screening_forces
      ! don't do density screening in case of response forces
      IF (with_resp_density) screen_pmat_forces = .FALSE.

      eps_storage = eps_schwarz*memory_parameter%eps_storage_scaling

      counter = 0_int_8

      !! Copy the libint_guy
      private_deriv = actual_x_data%lib_deriv

      !! Get screening parameter

      CALL get_atomic_kind_set(atomic_kind_set=atomic_kind_set, &
                               atom_of_kind=atom_of_kind, &
                               kind_of=kind_of)

      !! Create helper arrray for mapping local basis functions to global ones
      ALLOCATE (last_sgf_global(0:natom))
      last_sgf_global(0) = 0
      DO iatom = 1, natom
         ikind = kind_of(iatom)
         last_sgf_global(iatom) = last_sgf_global(iatom - 1) + basis_parameter(ikind)%nsgf_total
      END DO

      ALLOCATE (max_contraction(max_set, natom))

      max_contraction = 0.0_dp
      max_pgf = 0
      DO jatom = 1, natom
         jkind = kind_of(jatom)
         lb_max => basis_parameter(jkind)%lmax
         npgfb => basis_parameter(jkind)%npgf
         nsetb = basis_parameter(jkind)%nset
         first_sgfb => basis_parameter(jkind)%first_sgf
         sphi_b => basis_parameter(jkind)%sphi
         nsgfb => basis_parameter(jkind)%nsgf
         DO jset = 1, nsetb
            ! takes the primitive to contracted transformation into account
            ncob = npgfb(jset)*ncoset(lb_max(jset))
            sgfb = first_sgfb(1, jset)
            ! if the primitives are assumed to be all of max_val2, max_val2*p2s_b becomes
            ! the maximum value after multiplication with sphi_b
            max_contraction(jset, jatom) = MAXVAL((/(SUM(ABS(sphi_b(1:ncob, i))), i=sgfb, sgfb + nsgfb(jset) - 1)/))
            max_pgf = MAX(max_pgf, npgfb(jset))
         END DO
      END DO

      ncpu = para_env%num_pe
      n_processes = ncpu*n_threads

      !! initialize some counters
      neris_total = 0_int_8
      neris_incore = 0_int_8
      neris_onthefly = 0_int_8
      mem_eris = 0_int_8
      mem_max_val = 0_int_8
      compression_factor = 0.0_dp
      nprim_ints = 0_int_8
      neris_tmp = 0_int_8
      max_val_memory = 1_int_8

!$OMP MASTER
      !! Set pointer for is_assoc helper
      shm_is_assoc_atomic_block => actual_x_data%is_assoc_atomic_block
      shm_number_of_p_entries = actual_x_data%number_of_p_entries
      shm_atomic_block_offset => actual_x_data%atomic_block_offset
      shm_set_offset => actual_x_data%set_offset
      shm_block_offset => actual_x_data%block_offset

      NULLIFY (full_density_alpha)
      NULLIFY (full_density_beta)
      ALLOCATE (full_density_alpha(shm_block_offset(ncpu + 1), nkimages))

      !! Get the full density from all the processors
      CALL timeset(routineN//"_getP", handle_getP)
      CALL get_full_density(para_env, full_density_alpha(:, 1), rho_ao(1, 1)%matrix, shm_number_of_p_entries, &
                            shm_block_offset, &
                            kind_of, basis_parameter, get_max_vals_spin=.FALSE., &
                            antisymmetric=is_anti_symmetric)
      ! for now only closed shell case
      IF (with_resp_density) THEN
         NULLIFY (full_density_resp)
         ALLOCATE (full_density_resp(shm_block_offset(ncpu + 1)))
         CALL get_full_density(para_env, full_density_resp, rho_ao_resp(1)%matrix, shm_number_of_p_entries, &
                               shm_block_offset, &
                               kind_of, basis_parameter, get_max_vals_spin=.FALSE., &
                               antisymmetric=is_anti_symmetric)
      END IF

      IF (nspins == 2) THEN
         ALLOCATE (full_density_beta(shm_block_offset(ncpu + 1), 1))
         CALL get_full_density(para_env, full_density_beta(:, 1), rho_ao(2, 1)%matrix, shm_number_of_p_entries, &
                               shm_block_offset, &
                               kind_of, basis_parameter, get_max_vals_spin=.FALSE., &
                               antisymmetric=is_anti_symmetric)
         ! With resp density
         IF (with_resp_density) THEN
            NULLIFY (full_density_resp_beta)
            ALLOCATE (full_density_resp_beta(shm_block_offset(ncpu + 1)))
            CALL get_full_density(para_env, full_density_resp_beta, rho_ao_resp(2)%matrix, shm_number_of_p_entries, &
                                  shm_block_offset, &
                                  kind_of, basis_parameter, get_max_vals_spin=.FALSE., &
                                  antisymmetric=is_anti_symmetric)
         END IF

      END IF
      CALL timestop(handle_getP)

      !! Calculate max entries for screening on actual density. If screen_p_mat_forces = FALSE, the
      !! matrix is initialized to 1.0
      IF (screen_pmat_forces) THEN
         NULLIFY (shm_initial_p)
         shm_initial_p => actual_x_data%initial_p_forces
         shm_pmax_atom => actual_x_data%pmax_atom_forces
         IF (memory_parameter%recalc_forces) THEN
            CALL update_pmax_mat(shm_initial_p, &
                                 actual_x_data%map_atom_to_kind_atom, &
                                 actual_x_data%set_offset, &
                                 actual_x_data%atomic_block_offset, &
                                 shm_pmax_atom, &
                                 full_density_alpha, full_density_beta, &
                                 natom, kind_of, basis_parameter, &
                                 nkind, actual_x_data%is_assoc_atomic_block)
         END IF
      END IF

      ! restore as full density the HF density
      ! maybe in the future
      IF (with_resp_density .AND. .NOT. my_resp_only) THEN
         full_density_alpha(:, 1) = full_density_alpha(:, 1) - full_density_resp
         IF (nspins == 2) THEN
            full_density_beta(:, 1) = &
               full_density_beta(:, 1) - full_density_resp_beta
         END IF
         ! full_density_resp=full_density+full_density_resp
      END IF

      screen_coeffs_set => actual_x_data%screen_funct_coeffs_set
      screen_coeffs_kind => actual_x_data%screen_funct_coeffs_kind
      screen_coeffs_pgf => actual_x_data%screen_funct_coeffs_pgf
      radii_pgf => actual_x_data%pair_dist_radii_pgf
      shm_pmax_block => actual_x_data%pmax_block

!$OMP END MASTER
!$OMP BARRIER

!$OMP MASTER
      CALL timeset(routineN//"_load", handle_load)
!$OMP END MASTER
      !! Load balance the work
      IF (actual_x_data%memory_parameter%recalc_forces) THEN
         IF (actual_x_data%b_first_load_balance_forces) THEN
            CALL hfx_load_balance(actual_x_data, eps_schwarz, particle_set, max_set, para_env, &
                                  screen_coeffs_set, screen_coeffs_kind, &
                                  shm_is_assoc_atomic_block, do_periodic, &
                                  load_balance_parameter, kind_of, basis_parameter, shm_initial_p, &
                                  shm_pmax_atom, i_thread, n_threads, &
                                  cell, screen_pmat_forces, actual_x_data%map_atom_to_kind_atom, &
                                  nkind, hfx_do_eval_forces, shm_pmax_block, use_virial)
            actual_x_data%b_first_load_balance_forces = .FALSE.
         ELSE
            CALL hfx_update_load_balance(actual_x_data, para_env, &
                                         load_balance_parameter, &
                                         i_thread, n_threads, hfx_do_eval_forces)
         END IF
      END IF
!$OMP MASTER
      CALL timestop(handle_load)
!$OMP END MASTER

      !! precompute maximum nco and allocate scratch
      ncos_max = 0
      nsgf_max = 0
      DO iatom = 1, natom
         ikind = kind_of(iatom)
         nseta = basis_parameter(ikind)%nset
         npgfa => basis_parameter(ikind)%npgf
         la_max => basis_parameter(ikind)%lmax
         nsgfa => basis_parameter(ikind)%nsgf
         DO iset = 1, nseta
            ncos_max = MAX(ncos_max, ncoset(la_max(iset)))
            nsgf_max = MAX(nsgf_max, nsgfa(iset))
         END DO
      END DO

      !! Allocate work arrays
      ALLOCATE (primitive_forces(12*nsgf_max**4))
      primitive_forces = 0.0_dp

      ! ** Allocate buffers for pgf_lists
      nneighbors = SIZE(actual_x_data%neighbor_cells)
      ALLOCATE (pgf_list_ij(max_pgf**2))
      ALLOCATE (pgf_list_kl(max_pgf**2))
!$OMP     ATOMIC READ
      tmp_i4 = pgf_product_list_size
      ALLOCATE (pgf_product_list(tmp_i4))
      ALLOCATE (nimages(max_pgf**2))

      DO i = 1, max_pgf**2
         ALLOCATE (pgf_list_ij(i)%image_list(nneighbors))
         ALLOCATE (pgf_list_kl(i)%image_list(nneighbors))
      END DO

      ALLOCATE (pbd_buf(nsgf_max**2))
      ALLOCATE (pbc_buf(nsgf_max**2))
      ALLOCATE (pad_buf(nsgf_max**2))
      ALLOCATE (pac_buf(nsgf_max**2))
      IF (with_resp_density) THEN
         ALLOCATE (pbd_buf_resp(nsgf_max**2))
         ALLOCATE (pbc_buf_resp(nsgf_max**2))
         ALLOCATE (pad_buf_resp(nsgf_max**2))
         ALLOCATE (pac_buf_resp(nsgf_max**2))
      END IF
      IF (nspins == 2) THEN
         ALLOCATE (pbd_buf_beta(nsgf_max**2))
         ALLOCATE (pbc_buf_beta(nsgf_max**2))
         ALLOCATE (pad_buf_beta(nsgf_max**2))
         ALLOCATE (pac_buf_beta(nsgf_max**2))
         IF (with_resp_density) THEN
            ALLOCATE (pbd_buf_resp_beta(nsgf_max**2))
            ALLOCATE (pbc_buf_resp_beta(nsgf_max**2))
            ALLOCATE (pad_buf_resp_beta(nsgf_max**2))
            ALLOCATE (pac_buf_resp_beta(nsgf_max**2))
         END IF
      END IF

      ALLOCATE (ede_work(ncos_max**4*12))
      ALLOCATE (ede_work2(ncos_max**4*12))
      ALLOCATE (ede_work_forces(ncos_max**4*12))
      ALLOCATE (ede_buffer1(ncos_max**4))
      ALLOCATE (ede_buffer2(ncos_max**4))
      ALLOCATE (ede_primitives_tmp(nsgf_max**4))

      IF (use_virial) THEN
         ALLOCATE (primitive_forces_virial(12*nsgf_max**4*3))
         primitive_forces_virial = 0.0_dp
         ALLOCATE (ede_work_virial(ncos_max**4*12*3))
         ALLOCATE (ede_work2_virial(ncos_max**4*12*3))
         ALLOCATE (ede_primitives_tmp_virial(nsgf_max**4))
         tmp_virial = 0.0_dp
      ELSE
         ! dummy allocation
         ALLOCATE (primitive_forces_virial(1))
         primitive_forces_virial = 0.0_dp
         ALLOCATE (ede_work_virial(1))
         ALLOCATE (ede_work2_virial(1))
         ALLOCATE (ede_primitives_tmp_virial(1))
      END IF

      !! Start caluclating integrals of the form (ab|cd) or (ij|kl)
      !! In order to do so, there is a main four-loop structure that takes into account the two symmetries
      !!
      !!   (ab|cd) = (ba|cd) = (ab|dc) = (ba|dc)
      !!
      !! by iterating in the following way
      !!
      !! DO iatom=1,natom               and       DO katom=1,natom
      !!   DO jatom=iatom,natom                     DO latom=katom,natom
      !!
      !! The third symmetry
      !!
      !!  (ab|cd) = (cd|ab)
      !!
      !! is taken into account by the following criterion:
      !!
      !! IF(katom+latom<=iatom+jatom)  THEN
      !!   IF( ((iatom+jatom).EQ.(katom+latom) ) .AND.(katom<iatom)) CYCLE
      !!
      !! Furthermore, if iatom==jatom==katom==latom we cycle, because the derivatives are zero anyway.
      !!
      !! Depending on the degeneracy of an integral the exchange contribution is multiplied by a corresponding
      !! factor ( symm_fac ).
      !!
      !! If one quartet does not pass the screening we CYCLE on the outer most possible loop. Thats why we use
      !! different hierarchies of short range screening matrices.
      !!
      !! If we do a parallel run, each process owns a unique array of workloads. Here, a workload is
      !! defined as :
      !!
      !! istart, jstart, kstart, lstart, number_of_atom_quartets, initial_cpu_id
      !!
      !! This tells the process where to start the main loops and how many bunches of integrals it has to
      !! calculate. The original parallelization is a simple modulo distribution that is binned and
      !! optimized in the load_balance routines. Since the Monte Carlo routines can swap processors,
      !! we need to know which was the initial cpu_id.
      !! Furthermore, the indices iatom, jatom, katom, latom have to be set to istart, jstart, kstart and
      !! lstart only the first time the loop is executed. All subsequent loops have to start with one or
      !! iatom and katom respectively. Therefore, we use flags like first_j_loop etc.

!$OMP BARRIER
!$OMP MASTER
      CALL timeset(routineN//"_main", handle_main)
!$OMP END MASTER
!$OMP BARRIER

      coeffs_kind_max0 = MAXVAL(screen_coeffs_kind(:, :)%x(2))
      ALLOCATE (set_list_ij((max_set*load_balance_parameter%block_size)**2))
      ALLOCATE (set_list_kl((max_set*load_balance_parameter%block_size)**2))

!$OMP BARRIER

      IF (actual_x_data%memory_parameter%recalc_forces) THEN
         actual_x_data%distribution_forces%ram_counter = HUGE(distribution_forces%ram_counter)
         memory_parameter%ram_counter_forces = HUGE(memory_parameter%ram_counter_forces)
      END IF

      IF (treat_forces_in_core) THEN
         buffer_overflow = .FALSE.
      ELSE
         buffer_overflow = .TRUE.
      END IF

      do_dynamic_load_balancing = .TRUE.
      IF (n_threads == 1) do_dynamic_load_balancing = .FALSE.

      IF (do_dynamic_load_balancing) THEN
         my_bin_size = SIZE(actual_x_data%distribution_forces)
      ELSE
         my_bin_size = 1
      END IF
      !! We do not need the containers if MAX_MEM = 0
      IF (treat_forces_in_core) THEN
         !! IF new md step -> reinitialize containers
         IF (actual_x_data%memory_parameter%recalc_forces) THEN
            CALL dealloc_containers(actual_x_data%store_forces, memory_parameter%actual_memory_usage)
            CALL alloc_containers(actual_x_data%store_forces, my_bin_size)

            DO bin = 1, my_bin_size
               maxval_container => actual_x_data%store_forces%maxval_container(bin)
               integral_containers => actual_x_data%store_forces%integral_containers(:, bin)
               CALL hfx_init_container(maxval_container, memory_parameter%actual_memory_usage, .FALSE.)
               DO i = 1, 64
                  CALL hfx_init_container(integral_containers(i), memory_parameter%actual_memory_usage, .FALSE.)
               END DO
            END DO
         END IF

         !! Decompress the first cache for maxvals and integrals
         IF (.NOT. actual_x_data%memory_parameter%recalc_forces) THEN
            DO bin = 1, my_bin_size
               maxval_cache => actual_x_data%store_forces%maxval_cache(bin)
               maxval_container => actual_x_data%store_forces%maxval_container(bin)
               integral_caches => actual_x_data%store_forces%integral_caches(:, bin)
               integral_containers => actual_x_data%store_forces%integral_containers(:, bin)
               CALL hfx_decompress_first_cache(bits_max_val, maxval_cache, maxval_container, &
                                               memory_parameter%actual_memory_usage, .FALSE.)
               DO i = 1, 64
                  CALL hfx_decompress_first_cache(i, integral_caches(i), integral_containers(i), &
                                                  memory_parameter%actual_memory_usage, .FALSE.)
               END DO
            END DO
         END IF
      END IF

!$OMP BARRIER
!$OMP MASTER
      IF (do_dynamic_load_balancing) THEN
         ! ** Lets construct the task list
         shm_total_bins = 0
         DO i = 1, n_threads
            shm_total_bins = shm_total_bins + SIZE(my_x_data(irep, i)%distribution_forces)
         END DO
         ALLOCATE (tmp_task_list(shm_total_bins))
         shm_task_counter = 0
         DO i = 1, n_threads
            DO bin = 1, SIZE(my_x_data(irep, i)%distribution_forces)
               shm_task_counter = shm_task_counter + 1
               tmp_task_list(shm_task_counter)%thread_id = i
               tmp_task_list(shm_task_counter)%bin_id = bin
               tmp_task_list(shm_task_counter)%cost = my_x_data(irep, i)%distribution_forces(bin)%cost
            END DO
         END DO

         ! ** Now sort the task list
         ALLOCATE (tmp_task_list_cost(shm_total_bins))
         ALLOCATE (tmp_index(shm_total_bins))

         DO i = 1, shm_total_bins
            tmp_task_list_cost(i) = tmp_task_list(i)%cost
         END DO

         CALL sort(tmp_task_list_cost, shm_total_bins, tmp_index)

         ALLOCATE (actual_x_data%task_list(shm_total_bins))

         DO i = 1, shm_total_bins
            actual_x_data%task_list(i) = tmp_task_list(tmp_index(shm_total_bins - i + 1))
         END DO

         shm_task_list => actual_x_data%task_list
         shm_task_counter = 0

         DEALLOCATE (tmp_task_list_cost, tmp_index, tmp_task_list)
      END IF

      !! precalculate maximum density matrix elements in blocks
      shm_pmax_block => actual_x_data%pmax_block
      actual_x_data%pmax_block = 0.0_dp
      IF (screen_pmat_forces) THEN
         DO iatom_block = 1, SIZE(actual_x_data%blocks)
            iatom_start = actual_x_data%blocks(iatom_block)%istart
            iatom_end = actual_x_data%blocks(iatom_block)%iend
            DO jatom_block = 1, SIZE(actual_x_data%blocks)
               jatom_start = actual_x_data%blocks(jatom_block)%istart
               jatom_end = actual_x_data%blocks(jatom_block)%iend
               shm_pmax_block(iatom_block, jatom_block) = MAXVAL(shm_pmax_atom(iatom_start:iatom_end, jatom_start:jatom_end))
            END DO
         END DO
      END IF
      shm_atomic_pair_list => actual_x_data%atomic_pair_list_forces
      IF (actual_x_data%memory_parameter%recalc_forces) THEN
         CALL build_atomic_pair_list(natom, shm_atomic_pair_list, kind_of, basis_parameter, particle_set, &
                                     do_periodic, screen_coeffs_kind, coeffs_kind_max0, log10_eps_schwarz, cell, &
                                     actual_x_data%blocks)
      END IF

      my_bin_size = SIZE(actual_x_data%distribution_forces)
      DO bin = 1, my_bin_size
         actual_x_data%distribution_forces(bin)%time_forces = 0.0_dp
      END DO
!$OMP END MASTER
!$OMP BARRIER

      IF (treat_forces_in_core) THEN
         IF (my_bin_size > 0) THEN
            maxval_container => actual_x_data%store_forces%maxval_container(1)
            maxval_cache => actual_x_data%store_forces%maxval_cache(1)

            integral_containers => actual_x_data%store_forces%integral_containers(:, 1)
            integral_caches => actual_x_data%store_forces%integral_caches(:, 1)
         END IF
      END IF

      nblocks = load_balance_parameter%nblocks

      bins_left = .TRUE.
      do_it = .TRUE.
      bin = 0
      DO WHILE (bins_left)
         IF (.NOT. do_dynamic_load_balancing) THEN
            bin = bin + 1
            IF (bin > my_bin_size) THEN
               do_it = .FALSE.
               bins_left = .FALSE.
            ELSE
               do_it = .TRUE.
               bins_left = .TRUE.
               distribution_forces => actual_x_data%distribution_forces(bin)
            END IF
         ELSE
!$OMP CRITICAL(hfxderivatives_critical)
            shm_task_counter = shm_task_counter + 1
            IF (shm_task_counter <= shm_total_bins) THEN
               my_thread_id = shm_task_list(shm_task_counter)%thread_id
               my_bin_id = shm_task_list(shm_task_counter)%bin_id
               IF (treat_forces_in_core) THEN
                  maxval_cache => my_x_data(irep, my_thread_id)%store_forces%maxval_cache(my_bin_id)
                  maxval_container => my_x_data(irep, my_thread_id)%store_forces%maxval_container(my_bin_id)
                  integral_caches => my_x_data(irep, my_thread_id)%store_forces%integral_caches(:, my_bin_id)
                  integral_containers => my_x_data(irep, my_thread_id)%store_forces%integral_containers(:, my_bin_id)
               END IF
               distribution_forces => my_x_data(irep, my_thread_id)%distribution_forces(my_bin_id)
               do_it = .TRUE.
               bins_left = .TRUE.
               IF (actual_x_data%memory_parameter%recalc_forces) THEN
                  distribution_forces%ram_counter = HUGE(distribution_forces%ram_counter)
               END IF
               counter = 0_Int_8
            ELSE
               do_it = .FALSE.
               bins_left = .FALSE.
            END IF
!$OMP END CRITICAL(hfxderivatives_critical)
         END IF

         IF (.NOT. do_it) CYCLE
!$OMP MASTER
         CALL timeset(routineN//"_bin", handle_bin)
!$OMP END MASTER
         bintime_start = m_walltime()
         my_istart = distribution_forces%istart
         my_current_counter = 0
         IF (distribution_forces%number_of_atom_quartets == 0 .OR. &
             my_istart == -1_int_8) my_istart = nblocks**4
         atomic_blocks: DO atom_block = my_istart, nblocks**4 - 1, n_processes
            latom_block = INT(MODULO(atom_block, nblocks)) + 1
            tmp_block = atom_block/nblocks
            katom_block = INT(MODULO(tmp_block, nblocks)) + 1
            IF (latom_block < katom_block) CYCLE
            tmp_block = tmp_block/nblocks
            jatom_block = INT(MODULO(tmp_block, nblocks)) + 1
            tmp_block = tmp_block/nblocks
            iatom_block = INT(MODULO(tmp_block, nblocks)) + 1
            IF (jatom_block < iatom_block) CYCLE
            my_current_counter = my_current_counter + 1
            IF (my_current_counter > distribution_forces%number_of_atom_quartets) EXIT atomic_blocks

            iatom_start = actual_x_data%blocks(iatom_block)%istart
            iatom_end = actual_x_data%blocks(iatom_block)%iend
            jatom_start = actual_x_data%blocks(jatom_block)%istart
            jatom_end = actual_x_data%blocks(jatom_block)%iend
            katom_start = actual_x_data%blocks(katom_block)%istart
            katom_end = actual_x_data%blocks(katom_block)%iend
            latom_start = actual_x_data%blocks(latom_block)%istart
            latom_end = actual_x_data%blocks(latom_block)%iend

            pmax_blocks = MAX(shm_pmax_block(katom_block, iatom_block) + &
                              shm_pmax_block(latom_block, jatom_block), &
                              shm_pmax_block(latom_block, iatom_block) + &
                              shm_pmax_block(katom_block, jatom_block))

            IF (2.0_dp*coeffs_kind_max0 + pmax_blocks < log10_eps_schwarz) CYCLE

            CALL build_pair_list(natom, list_ij, set_list_ij, iatom_start, iatom_end, &
                                 jatom_start, jatom_end, &
                                 kind_of, basis_parameter, particle_set, &
                                 do_periodic, screen_coeffs_set, screen_coeffs_kind, coeffs_kind_max0, &
                                 log10_eps_schwarz, cell, pmax_blocks, shm_atomic_pair_list)

            CALL build_pair_list(natom, list_kl, set_list_kl, katom_start, katom_end, &
                                 latom_start, latom_end, &
                                 kind_of, basis_parameter, particle_set, &
                                 do_periodic, screen_coeffs_set, screen_coeffs_kind, coeffs_kind_max0, &
                                 log10_eps_schwarz, cell, pmax_blocks, shm_atomic_pair_list)

            DO i_list_ij = 1, list_ij%n_element
               iatom = list_ij%elements(i_list_ij)%pair(1)
               jatom = list_ij%elements(i_list_ij)%pair(2)
               i_set_list_ij_start = list_ij%elements(i_list_ij)%set_bounds(1)
               i_set_list_ij_stop = list_ij%elements(i_list_ij)%set_bounds(2)
               ikind = list_ij%elements(i_list_ij)%kind_pair(1)
               jkind = list_ij%elements(i_list_ij)%kind_pair(2)
               ra = list_ij%elements(i_list_ij)%r1
               rb = list_ij%elements(i_list_ij)%r2
               rab2 = list_ij%elements(i_list_ij)%dist2

               la_max => basis_parameter(ikind)%lmax
               la_min => basis_parameter(ikind)%lmin
               npgfa => basis_parameter(ikind)%npgf
               nseta = basis_parameter(ikind)%nset
               zeta => basis_parameter(ikind)%zet
               nsgfa => basis_parameter(ikind)%nsgf
               sphi_a_ext => basis_parameter(ikind)%sphi_ext(:, :, :, :)
               nsgfl_a => basis_parameter(ikind)%nsgfl
               sphi_a_u1 = UBOUND(sphi_a_ext, 1)
               sphi_a_u2 = UBOUND(sphi_a_ext, 2)
               sphi_a_u3 = UBOUND(sphi_a_ext, 3)

               lb_max => basis_parameter(jkind)%lmax
               lb_min => basis_parameter(jkind)%lmin
               npgfb => basis_parameter(jkind)%npgf
               nsetb = basis_parameter(jkind)%nset
               zetb => basis_parameter(jkind)%zet
               nsgfb => basis_parameter(jkind)%nsgf
               sphi_b_ext => basis_parameter(jkind)%sphi_ext(:, :, :, :)
               nsgfl_b => basis_parameter(jkind)%nsgfl
               sphi_b_u1 = UBOUND(sphi_b_ext, 1)
               sphi_b_u2 = UBOUND(sphi_b_ext, 2)
               sphi_b_u3 = UBOUND(sphi_b_ext, 3)

               i_atom = atom_of_kind(iatom)
               forces_map(1, 1) = ikind
               forces_map(1, 2) = i_atom
               j_atom = atom_of_kind(jatom)
               forces_map(2, 1) = jkind
               forces_map(2, 2) = j_atom

               DO i_list_kl = 1, list_kl%n_element
                  katom = list_kl%elements(i_list_kl)%pair(1)
                  latom = list_kl%elements(i_list_kl)%pair(2)

                  !All four centers equivalent => zero-contribution
!VIIRAL            IF((iatom==jatom .AND. iatom==katom .AND. iatom==latom)) CYCLE
                  IF (.NOT. (katom + latom <= iatom + jatom)) CYCLE
                  IF (((iatom + jatom) .EQ. (katom + latom)) .AND. (katom < iatom)) CYCLE

                  i_set_list_kl_start = list_kl%elements(i_list_kl)%set_bounds(1)
                  i_set_list_kl_stop = list_kl%elements(i_list_kl)%set_bounds(2)
                  kkind = list_kl%elements(i_list_kl)%kind_pair(1)
                  lkind = list_kl%elements(i_list_kl)%kind_pair(2)
                  rc = list_kl%elements(i_list_kl)%r1
                  rd = list_kl%elements(i_list_kl)%r2
                  rcd2 = list_kl%elements(i_list_kl)%dist2

                  IF (screen_pmat_forces) THEN
                     pmax_atom = MAX(shm_pmax_atom(katom, iatom) + &
                                     shm_pmax_atom(latom, jatom), &
                                     shm_pmax_atom(latom, iatom) + &
                                     shm_pmax_atom(katom, jatom))
                  ELSE
                     pmax_atom = 0.0_dp
                  END IF

                  IF ((screen_coeffs_kind(jkind, ikind)%x(1)*rab2 + &
                       screen_coeffs_kind(jkind, ikind)%x(2)) + &
                      (screen_coeffs_kind(lkind, kkind)%x(1)*rcd2 + &
                       screen_coeffs_kind(lkind, kkind)%x(2)) + pmax_atom < log10_eps_schwarz) CYCLE

                  IF (.NOT. (shm_is_assoc_atomic_block(latom, iatom) >= 1 .AND. &
                             shm_is_assoc_atomic_block(katom, iatom) >= 1 .AND. &
                             shm_is_assoc_atomic_block(katom, jatom) >= 1 .AND. &
                             shm_is_assoc_atomic_block(latom, jatom) >= 1)) CYCLE
                  k_atom = atom_of_kind(katom)
                  forces_map(3, 1) = kkind
                  forces_map(3, 2) = k_atom

                  l_atom = atom_of_kind(latom)
                  forces_map(4, 1) = lkind
                  forces_map(4, 2) = l_atom

                  IF (nspins == 1) THEN
                     fac = 0.25_dp*hf_fraction
                  ELSE
                     fac = 0.5_dp*hf_fraction
                  END IF
                  !calculate symmetry_factor
                  symm_fac = 0.25_dp
                  IF (iatom == jatom) symm_fac = symm_fac*2.0_dp
                  IF (katom == latom) symm_fac = symm_fac*2.0_dp
                  IF (iatom == katom .AND. jatom == latom .AND. &
                      iatom /= jatom .AND. katom /= latom) symm_fac = symm_fac*2.0_dp
                  IF (iatom == katom .AND. iatom == jatom .AND. &
                      katom == latom) symm_fac = symm_fac*2.0_dp

                  symm_fac = 1.0_dp/symm_fac
                  fac = fac*symm_fac

                  lc_max => basis_parameter(kkind)%lmax
                  lc_min => basis_parameter(kkind)%lmin
                  npgfc => basis_parameter(kkind)%npgf
                  zetc => basis_parameter(kkind)%zet
                  nsgfc => basis_parameter(kkind)%nsgf
                  sphi_c_ext => basis_parameter(kkind)%sphi_ext(:, :, :, :)
                  nsgfl_c => basis_parameter(kkind)%nsgfl
                  sphi_c_u1 = UBOUND(sphi_c_ext, 1)
                  sphi_c_u2 = UBOUND(sphi_c_ext, 2)
                  sphi_c_u3 = UBOUND(sphi_c_ext, 3)

                  ld_max => basis_parameter(lkind)%lmax
                  ld_min => basis_parameter(lkind)%lmin
                  npgfd => basis_parameter(lkind)%npgf
                  zetd => basis_parameter(lkind)%zet
                  nsgfd => basis_parameter(lkind)%nsgf
                  sphi_d_ext => basis_parameter(lkind)%sphi_ext(:, :, :, :)
                  nsgfl_d => basis_parameter(lkind)%nsgfl
                  sphi_d_u1 = UBOUND(sphi_d_ext, 1)
                  sphi_d_u2 = UBOUND(sphi_d_ext, 2)
                  sphi_d_u3 = UBOUND(sphi_d_ext, 3)

                  atomic_offset_bd = shm_atomic_block_offset(jatom, latom)
                  atomic_offset_bc = shm_atomic_block_offset(jatom, katom)
                  atomic_offset_ad = shm_atomic_block_offset(iatom, latom)
                  atomic_offset_ac = shm_atomic_block_offset(iatom, katom)

                  IF (jatom < latom) THEN
                     offset_bd_set => shm_set_offset(:, :, lkind, jkind)
                  ELSE
                     offset_bd_set => shm_set_offset(:, :, jkind, lkind)
                  END IF
                  IF (jatom < katom) THEN
                     offset_bc_set => shm_set_offset(:, :, kkind, jkind)
                  ELSE
                     offset_bc_set => shm_set_offset(:, :, jkind, kkind)
                  END IF
                  IF (iatom < latom) THEN
                     offset_ad_set => shm_set_offset(:, :, lkind, ikind)
                  ELSE
                     offset_ad_set => shm_set_offset(:, :, ikind, lkind)
                  END IF
                  IF (iatom < katom) THEN
                     offset_ac_set => shm_set_offset(:, :, kkind, ikind)
                  ELSE
                     offset_ac_set => shm_set_offset(:, :, ikind, kkind)
                  END IF

                  IF (screen_pmat_forces) THEN
                     swap_id = 16
                     kind_kind_idx = INT(get_1D_idx(kkind, ikind, INT(nkind, int_8)))
                     IF (ikind >= kkind) THEN
                        ptr_p_1 => shm_initial_p(kind_kind_idx)%p_kind(:, :, &
                                                                       actual_x_data%map_atom_to_kind_atom(katom), &
                                                                       actual_x_data%map_atom_to_kind_atom(iatom))
                     ELSE
                        ptr_p_1 => shm_initial_p(kind_kind_idx)%p_kind(:, :, &
                                                                       actual_x_data%map_atom_to_kind_atom(iatom), &
                                                                       actual_x_data%map_atom_to_kind_atom(katom))
                        swap_id = swap_id + 1
                     END IF
                     kind_kind_idx = INT(get_1D_idx(lkind, jkind, INT(nkind, int_8)))
                     IF (jkind >= lkind) THEN
                        ptr_p_2 => shm_initial_p(kind_kind_idx)%p_kind(:, :, &
                                                                       actual_x_data%map_atom_to_kind_atom(latom), &
                                                                       actual_x_data%map_atom_to_kind_atom(jatom))
                     ELSE
                        ptr_p_2 => shm_initial_p(kind_kind_idx)%p_kind(:, :, &
                                                                       actual_x_data%map_atom_to_kind_atom(jatom), &
                                                                       actual_x_data%map_atom_to_kind_atom(latom))
                        swap_id = swap_id + 2
                     END IF
                     kind_kind_idx = INT(get_1D_idx(lkind, ikind, INT(nkind, int_8)))
                     IF (ikind >= lkind) THEN
                        ptr_p_3 => shm_initial_p(kind_kind_idx)%p_kind(:, :, &
                                                                       actual_x_data%map_atom_to_kind_atom(latom), &
                                                                       actual_x_data%map_atom_to_kind_atom(iatom))
                     ELSE
                        ptr_p_3 => shm_initial_p(kind_kind_idx)%p_kind(:, :, &
                                                                       actual_x_data%map_atom_to_kind_atom(iatom), &
                                                                       actual_x_data%map_atom_to_kind_atom(latom))
                        swap_id = swap_id + 4
                     END IF
                     kind_kind_idx = INT(get_1D_idx(kkind, jkind, INT(nkind, int_8)))
                     IF (jkind >= kkind) THEN
                        ptr_p_4 => shm_initial_p(kind_kind_idx)%p_kind(:, :, &
                                                                       actual_x_data%map_atom_to_kind_atom(katom), &
                                                                       actual_x_data%map_atom_to_kind_atom(jatom))
                     ELSE
                        ptr_p_4 => shm_initial_p(kind_kind_idx)%p_kind(:, :, &
                                                                       actual_x_data%map_atom_to_kind_atom(jatom), &
                                                                       actual_x_data%map_atom_to_kind_atom(katom))
                        swap_id = swap_id + 8
                     END IF
                  END IF

                  !! At this stage, check for memory used in compression

                  IF (actual_x_data%memory_parameter%recalc_forces) THEN
                     IF (treat_forces_in_core) THEN
                        ! ** We know the maximum amount of integrals that we can store per MPI-process
                        ! ** Now we need to sum the current memory usage among all openMP threads
                        ! ** We can just read what is currently stored on the corresponding x_data type
                        ! ** If this is thread i and it tries to read the data from thread j, that is
                        ! ** currently writing that data, we just dont care, because the possible error
                        ! ** is of the order of CACHE_SIZE
                        mem_compression_counter = 0
                        DO i = 1, n_threads
!$OMP                   ATOMIC READ
                           tmp_i4 = my_x_data(irep, i)%memory_parameter%actual_memory_usage
                           mem_compression_counter = mem_compression_counter + &
                                                     tmp_i4*memory_parameter%cache_size
                        END DO
                        IF (mem_compression_counter + memory_parameter%final_comp_counter_energy &
                            > memory_parameter%max_compression_counter) THEN
                           buffer_overflow = .TRUE.
                           IF (do_dynamic_load_balancing) THEN
                              distribution_forces%ram_counter = counter
                           ELSE
                              memory_parameter%ram_counter_forces = counter
                           END IF
                        ELSE
                           counter = counter + 1
                           buffer_overflow = .FALSE.
                        END IF
                     END IF
                  ELSE
                     IF (treat_forces_in_core) THEN
                        IF (do_dynamic_load_balancing) THEN
                           IF (distribution_forces%ram_counter == counter) THEN
                              buffer_overflow = .TRUE.
                           ELSE
                              counter = counter + 1
                              buffer_overflow = .FALSE.
                           END IF
                        ELSE
                           IF (memory_parameter%ram_counter_forces == counter) THEN
                              buffer_overflow = .TRUE.
                           ELSE
                              counter = counter + 1
                              buffer_overflow = .FALSE.
                           END IF
                        END IF
                     END IF
                  END IF

                  DO i_set_list_ij = i_set_list_ij_start, i_set_list_ij_stop
                     iset = set_list_ij(i_set_list_ij)%pair(1)
                     jset = set_list_ij(i_set_list_ij)%pair(2)

                     ncob = npgfb(jset)*ncoset(lb_max(jset))
                     max_val1 = screen_coeffs_set(jset, iset, jkind, ikind)%x(1)*rab2 + &
                                screen_coeffs_set(jset, iset, jkind, ikind)%x(2)
                     !! Near field screening
                     IF (max_val1 + (screen_coeffs_kind(lkind, kkind)%x(1)*rcd2 + &
                                     screen_coeffs_kind(lkind, kkind)%x(2)) + pmax_atom < log10_eps_schwarz) CYCLE
                     sphi_a_ext_set => sphi_a_ext(:, :, :, iset)
                     sphi_b_ext_set => sphi_b_ext(:, :, :, jset)

                     DO i_set_list_kl = i_set_list_kl_start, i_set_list_kl_stop
                        kset = set_list_kl(i_set_list_kl)%pair(1)
                        lset = set_list_kl(i_set_list_kl)%pair(2)

                        max_val2_set = (screen_coeffs_set(lset, kset, lkind, kkind)%x(1)*rcd2 + &
                                        screen_coeffs_set(lset, kset, lkind, kkind)%x(2))
                        max_val2 = max_val1 + max_val2_set

                        !! Near field screening
                        IF (max_val2 + pmax_atom < log10_eps_schwarz) CYCLE
                        sphi_c_ext_set => sphi_c_ext(:, :, :, kset)
                        sphi_d_ext_set => sphi_d_ext(:, :, :, lset)

                        IF (screen_pmat_forces) THEN
                           CALL get_pmax_val(ptr_p_1, ptr_p_2, ptr_p_3, ptr_p_4, &
                                             iset, jset, kset, lset, &
                                             pmax_tmp, swap_id)

                           log10_pmax = log_2 + pmax_tmp
                        ELSE
                           log10_pmax = 0.0_dp
                        END IF

                        max_val2 = max_val2 + log10_pmax
                        IF (max_val2 < log10_eps_schwarz) CYCLE

                        pmax_entry = EXP(log10_pmax*ln_10)
                        !! store current number of integrals, update total number and number of integrals in buffer
                        current_counter = nsgfa(iset)*nsgfb(jset)*nsgfc(kset)*nsgfd(lset)*12
                        IF (buffer_overflow) THEN
                           neris_onthefly = neris_onthefly + current_counter
                        END IF

                        !! Get integrals from buffer and update Kohn-Sham matrix
                        IF (.NOT. buffer_overflow .AND. .NOT. actual_x_data%memory_parameter%recalc_forces) THEN
                           nints = current_counter
                           CALL hfx_get_single_cache_element(estimate_to_store_int, 6, &
                                                             maxval_cache, maxval_container, memory_parameter%actual_memory_usage, &
                                                             use_disk_storage)
                           spherical_estimate = SET_EXPONENT(1.0_dp, estimate_to_store_int + 1)
!                           IF (spherical_estimate*pmax_entry < eps_schwarz) CYCLE
                           IF (.NOT. use_virial) THEN
                              IF (spherical_estimate*pmax_entry < eps_schwarz) CYCLE
                           END IF
                           nbits = EXPONENT(ANINT(spherical_estimate*pmax_entry/eps_storage)) + 1
                           buffer_left = nints
                           buffer_start = 1
                           neris_incore = neris_incore + INT(nints, int_8)
                           DO WHILE (buffer_left > 0)
                              buffer_size = MIN(buffer_left, cache_size)
                              CALL hfx_get_mult_cache_elements(primitive_forces(buffer_start), &
                                                               buffer_size, nbits, &
                                                               integral_caches(nbits), &
                                                               integral_containers(nbits), &
                                                               eps_storage, pmax_entry, &
                                                               memory_parameter%actual_memory_usage, &
                                                               use_disk_storage)
                              buffer_left = buffer_left - buffer_size
                              buffer_start = buffer_start + buffer_size
                           END DO
                        END IF
                        !! Calculate integrals if we run out of buffer or the geometry did change
                        IF (actual_x_data%memory_parameter%recalc_forces .OR. buffer_overflow) THEN
                           max_contraction_val = max_contraction(iset, iatom)* &
                                                 max_contraction(jset, jatom)* &
                                                 max_contraction(kset, katom)* &
                                                 max_contraction(lset, latom)* &
                                                 pmax_entry
                           tmp_R_1 => radii_pgf(:, :, jset, iset, jkind, ikind)
                           tmp_R_2 => radii_pgf(:, :, lset, kset, lkind, kkind)
                           tmp_screen_pgf1 => screen_coeffs_pgf(:, :, jset, iset, jkind, ikind)
                           tmp_screen_pgf2 => screen_coeffs_pgf(:, :, lset, kset, lkind, kkind)

                           CALL forces4(private_deriv, ra, rb, rc, rd, npgfa(iset), npgfb(jset), &
                                        npgfc(kset), npgfd(lset), &
                                        la_min(iset), la_max(iset), lb_min(jset), lb_max(jset), &
                                        lc_min(kset), lc_max(kset), ld_min(lset), ld_max(lset), &
                                        nsgfa(iset), nsgfb(jset), nsgfc(kset), nsgfd(lset), &
                                        sphi_a_u1, sphi_a_u2, sphi_a_u3, &
                                        sphi_b_u1, sphi_b_u2, sphi_b_u3, &
                                        sphi_c_u1, sphi_c_u2, sphi_c_u3, &
                                        sphi_d_u1, sphi_d_u2, sphi_d_u3, &
                                        zeta(1:npgfa(iset), iset), zetb(1:npgfb(jset), jset), &
                                        zetc(1:npgfc(kset), kset), zetd(1:npgfd(lset), lset), &
                                        primitive_forces, &
                                        potential_parameter, &
                                        actual_x_data%neighbor_cells, &
                                        screen_coeffs_set(jset, iset, jkind, ikind)%x, &
                                        screen_coeffs_set(lset, kset, lkind, kkind)%x, eps_schwarz, &
                                        max_contraction_val, cartesian_estimate, cell, neris_tmp, &
                                        log10_pmax, log10_eps_schwarz, &
                                        tmp_R_1, tmp_R_2, tmp_screen_pgf1, tmp_screen_pgf2, &
                                        pgf_list_ij, pgf_list_kl, pgf_product_list, &
                                        nsgfl_a(:, iset), nsgfl_b(:, jset), &
                                        nsgfl_c(:, kset), nsgfl_d(:, lset), &
                                        sphi_a_ext_set, sphi_b_ext_set, sphi_c_ext_set, sphi_d_ext_set, &
                                        ede_work, ede_work2, ede_work_forces, &
                                        ede_buffer1, ede_buffer2, ede_primitives_tmp, &
                                        nimages, do_periodic, use_virial, ede_work_virial, ede_work2_virial, &
                                        ede_primitives_tmp_virial, primitive_forces_virial, &
                                        cartesian_estimate_virial)

                           nints = nsgfa(iset)*nsgfb(jset)*nsgfc(kset)*nsgfd(lset)*12
                           neris_total = neris_total + nints
                           nprim_ints = nprim_ints + neris_tmp
!                           IF (cartesian_estimate == 0.0_dp) cartesian_estimate = TINY(cartesian_estimate)
!                           estimate_to_store_int = EXPONENT(cartesian_estimate)
!                           estimate_to_store_int = MAX(estimate_to_store_int, -15_int_8)
!                           cartesian_estimate = SET_EXPONENT(1.0_dp, estimate_to_store_int+1)
!                           IF (.NOT. buffer_overflow .AND. actual_x_data%memory_parameter%recalc_forces) THEN
!                              IF (cartesian_estimate < eps_schwarz) THEN
!                                 CALL hfx_add_single_cache_element( &
!                                    estimate_to_store_int, 6, &
!                                    maxval_cache, maxval_container, memory_parameter%actual_memory_usage, &
!                                    use_disk_storage, max_val_memory)
!                              END IF
!                           END IF
!
!                           IF (.NOT. use_virial) THEN
!                              IF (cartesian_estimate < eps_schwarz) CYCLE
!                           END IF

                           !! Compress the array for storage
                           spherical_estimate = 0.0_dp
                           DO i = 1, nints
                              spherical_estimate = MAX(spherical_estimate, ABS(primitive_forces(i)))
                           END DO

                           IF (use_virial) THEN
                              spherical_estimate_virial = 0.0_dp
                              DO i = 1, 3*nints
                                 spherical_estimate_virial = MAX(spherical_estimate_virial, ABS(primitive_forces_virial(i)))
                              END DO
                           END IF

                           IF (spherical_estimate == 0.0_dp) spherical_estimate = TINY(spherical_estimate)
                           estimate_to_store_int = EXPONENT(spherical_estimate)
                           estimate_to_store_int = MAX(estimate_to_store_int, -15_int_8)
                           IF (.NOT. buffer_overflow .AND. actual_x_data%memory_parameter%recalc_forces) THEN
                              CALL hfx_add_single_cache_element(estimate_to_store_int, 6, &
                                                                maxval_cache, maxval_container, &
                                                                memory_parameter%actual_memory_usage, &
                                                                use_disk_storage, max_val_memory)
                           END IF
                           spherical_estimate = SET_EXPONENT(1.0_dp, estimate_to_store_int + 1)
                           IF (.NOT. use_virial) THEN
                              IF (spherical_estimate*pmax_entry < eps_schwarz) CYCLE
                           END IF
                           IF (.NOT. buffer_overflow) THEN
                              nbits = EXPONENT(ANINT(spherical_estimate*pmax_entry/eps_storage)) + 1
                              buffer_left = nints
                              buffer_start = 1
!                              neris_incore = neris_incore+nints
                              neris_incore = neris_incore + INT(nints, int_8)

                              DO WHILE (buffer_left > 0)
                                 buffer_size = MIN(buffer_left, CACHE_SIZE)
                                 CALL hfx_add_mult_cache_elements(primitive_forces(buffer_start), &
                                                                  buffer_size, nbits, &
                                                                  integral_caches(nbits), &
                                                                  integral_containers(nbits), &
                                                                  eps_storage, pmax_entry, &
                                                                  memory_parameter%actual_memory_usage, &
                                                                  use_disk_storage)
                                 buffer_left = buffer_left - buffer_size
                                 buffer_start = buffer_start + buffer_size
                              END DO
                           ELSE
                              !! In order to be consistent with in-core part, round all the eris wrt. eps_schwarz
                              !! but only if we treat forces in-core
                              IF (treat_forces_in_core) THEN
                                 DO i = 1, nints
                                    primitive_forces(i) = primitive_forces(i)*pmax_entry
                                    IF (ABS(primitive_forces(i)) > eps_storage) THEN
                                       primitive_forces(i) = ANINT(primitive_forces(i)/eps_storage, dp)*eps_storage/pmax_entry
                                    ELSE
                                       primitive_forces(i) = 0.0_dp
                                    END IF
                                 END DO
                              END IF
                           END IF
                        END IF
                        IF (with_resp_density) THEN
                           CALL prefetch_density_matrix(nsgfa(iset), nsgfb(jset), nsgfc(kset), nsgfd(lset), &
                                                        full_density_alpha(:, 1), pbd_buf, pbc_buf, pad_buf, pac_buf, &
                                                        iatom, jatom, katom, latom, &
                                                        iset, jset, kset, lset, offset_bd_set, offset_bc_set, &
                                                        offset_ad_set, offset_ac_set, atomic_offset_bd, atomic_offset_bc, &
                                                        atomic_offset_ad, atomic_offset_ac, is_anti_symmetric)
                           CALL prefetch_density_matrix(nsgfa(iset), nsgfb(jset), nsgfc(kset), nsgfd(lset), &
                                                        full_density_resp, pbd_buf_resp, pbc_buf_resp, pad_buf_resp, pac_buf_resp, &
                                                        iatom, jatom, katom, latom, &
                                                        iset, jset, kset, lset, offset_bd_set, offset_bc_set, &
                                                        offset_ad_set, offset_ac_set, atomic_offset_bd, atomic_offset_bc, &
                                                        atomic_offset_ad, atomic_offset_ac, is_anti_symmetric)
                        ELSE
                           CALL prefetch_density_matrix(nsgfa(iset), nsgfb(jset), nsgfc(kset), nsgfd(lset), &
                                                        full_density_alpha(:, 1), pbd_buf, pbc_buf, pad_buf, pac_buf, &
                                                        iatom, jatom, katom, latom, &
                                                        iset, jset, kset, lset, offset_bd_set, offset_bc_set, &
                                                        offset_ad_set, offset_ac_set, atomic_offset_bd, atomic_offset_bc, &
                                                        atomic_offset_ad, atomic_offset_ac, is_anti_symmetric)
                        END IF
                        IF (nspins == 2) THEN
                           IF (with_resp_density) THEN
                              CALL prefetch_density_matrix(nsgfa(iset), nsgfb(jset), nsgfc(kset), nsgfd(lset), &
                                                           full_density_beta(:, 1), pbd_buf_beta, pbc_buf_beta, &
                                                           pad_buf_beta, pac_buf_beta, iatom, jatom, katom, latom, &
                                                           iset, jset, kset, lset, offset_bd_set, offset_bc_set, &
                                                           offset_ad_set, offset_ac_set, atomic_offset_bd, atomic_offset_bc, &
                                                           atomic_offset_ad, atomic_offset_ac, is_anti_symmetric)
                              CALL prefetch_density_matrix(nsgfa(iset), nsgfb(jset), nsgfc(kset), nsgfd(lset), &
                                                           full_density_resp_beta, pbd_buf_resp_beta, pbc_buf_resp_beta, &
                                                           pad_buf_resp_beta, pac_buf_resp_beta, iatom, jatom, katom, latom, &
                                                           iset, jset, kset, lset, offset_bd_set, offset_bc_set, &
                                                           offset_ad_set, offset_ac_set, atomic_offset_bd, atomic_offset_bc, &
                                                           atomic_offset_ad, atomic_offset_ac, is_anti_symmetric)
                           ELSE
                              CALL prefetch_density_matrix(nsgfa(iset), nsgfb(jset), nsgfc(kset), nsgfd(lset), &
                                                           full_density_beta(:, 1), pbd_buf_beta, pbc_buf_beta, pad_buf_beta, &
                                                           pac_buf_beta, iatom, jatom, katom, latom, &
                                                           iset, jset, kset, lset, offset_bd_set, offset_bc_set, &
                                                           offset_ad_set, offset_ac_set, atomic_offset_bd, atomic_offset_bc, &
                                                           atomic_offset_ad, atomic_offset_ac, is_anti_symmetric)
                           END IF
                        END IF
                        IF (spherical_estimate*pmax_entry >= eps_schwarz) THEN
                           DO coord = 1, 12
                              T2 => primitive_forces((coord - 1)*nsgfa(iset)*nsgfb(jset)*nsgfc(kset)*nsgfd(lset) + 1: &
                                                     coord*nsgfa(iset)*nsgfb(jset)*nsgfc(kset)*nsgfd(lset))

                              IF (with_resp_density) THEN
                                 CALL update_forces(nsgfa(iset), nsgfb(jset), nsgfc(kset), nsgfd(lset), &
                                                    pbd_buf, pbc_buf, pad_buf, pac_buf, fac, &
                                                    T2, force, forces_map, coord, &
                                                    pbd_buf_resp, pbc_buf_resp, pad_buf_resp, pac_buf_resp, &
                                                    my_resp_only)
                              ELSE
                                 CALL update_forces(nsgfa(iset), nsgfb(jset), nsgfc(kset), nsgfd(lset), &
                                                    pbd_buf, pbc_buf, pad_buf, pac_buf, fac, &
                                                    T2, force, forces_map, coord)
                              END IF
                              IF (nspins == 2) THEN
                                 IF (with_resp_density) THEN
                                    CALL update_forces(nsgfa(iset), nsgfb(jset), nsgfc(kset), nsgfd(lset), &
                                                       pbd_buf_beta, pbc_buf_beta, pad_buf_beta, pac_buf_beta, &
                                                       fac, T2, force, forces_map, coord, &
                                                       pbd_buf_resp_beta, pbc_buf_resp_beta, &
                                                       pad_buf_resp_beta, pac_buf_resp_beta, &
                                                       my_resp_only)
                                 ELSE
                                    CALL update_forces(nsgfa(iset), nsgfb(jset), nsgfc(kset), nsgfd(lset), &
                                                       pbd_buf_beta, pbc_buf_beta, pad_buf_beta, pac_buf_beta, fac, &
                                                       T2, force, forces_map, coord)
                                 END IF
                              END IF
                           END DO !coord
                        END IF
                        IF (use_virial .AND. spherical_estimate_virial*pmax_entry >= eps_schwarz) THEN
                           DO coord = 1, 12
                              DO i = 1, 3
                                 T2 => primitive_forces_virial( &
                                       ((i - 1)*12 + coord - 1)*nsgfa(iset)*nsgfb(jset)*nsgfc(kset)*nsgfd(lset) + 1: &
                                       ((i - 1)*12 + coord)*nsgfa(iset)*nsgfb(jset)*nsgfc(kset)*nsgfd(lset))
                                 IF (with_resp_density) THEN
                                    CALL update_virial(nsgfa(iset), nsgfb(jset), nsgfc(kset), nsgfd(lset), &
                                                       pbd_buf, pbc_buf, pad_buf, pac_buf, fac, &
                                                       T2, tmp_virial, coord, i, &
                                                       pbd_buf_resp, pbc_buf_resp, pad_buf_resp, &
                                                       pac_buf_resp, my_resp_only)
                                 ELSE
                                    CALL update_virial(nsgfa(iset), nsgfb(jset), nsgfc(kset), nsgfd(lset), &
                                                       pbd_buf, pbc_buf, pad_buf, pac_buf, fac, &
                                                       T2, tmp_virial, coord, i)
                                 END IF
                                 IF (nspins == 2) THEN
                                    IF (with_resp_density) THEN
                                       CALL update_virial(nsgfa(iset), nsgfb(jset), nsgfc(kset), nsgfd(lset), &
                                                          pbd_buf_beta, pbc_buf_beta, pad_buf_beta, pac_buf_beta, &
                                                          fac, T2, tmp_virial, coord, i, &
                                                          pbd_buf_resp_beta, pbc_buf_resp_beta, &
                                                          pad_buf_resp_beta, pac_buf_resp_beta, &
                                                          my_resp_only)
                                    ELSE
                                       CALL update_virial(nsgfa(iset), nsgfb(jset), nsgfc(kset), nsgfd(lset), &
                                                          pbd_buf_beta, pbc_buf_beta, pad_buf_beta, pac_buf_beta, &
                                                          fac, T2, tmp_virial, coord, i)
                                    END IF
                                 END IF
                              END DO
                           END DO !coord
                        END IF

                     END DO !i_set_list_kl
                  END DO !i_set_list_ij
               END DO !i_list_kl
            END DO !i_list_ij
         END DO atomic_blocks
         bintime_stop = m_walltime()
         distribution_forces%time_forces = bintime_stop - bintime_start
!$OMP MASTER
         CALL timestop(handle_bin)
!$OMP END MASTER
      END DO !bin

!$OMP MASTER
      logger => cp_get_default_logger()
      do_print_load_balance_info = .FALSE.
      do_print_load_balance_info = BTEST(cp_print_key_should_output(logger%iter_info, hfx_section, &
                                                                    "LOAD_BALANCE%PRINT/LOAD_BALANCE_INFO"), cp_p_file)
!$OMP END MASTER
!$OMP BARRIER
      IF (do_print_load_balance_info) THEN
         iw = -1
!$OMP MASTER
         iw = cp_print_key_unit_nr(logger, hfx_section, "LOAD_BALANCE%PRINT/LOAD_BALANCE_INFO", &
                                   extension=".scfLog")
!$OMP END MASTER

         CALL collect_load_balance_info(para_env, actual_x_data, iw, n_threads, i_thread, &
                                        hfx_do_eval_forces)

!$OMP MASTER
         CALL cp_print_key_finished_output(iw, logger, hfx_section, &
                                           "LOAD_BALANCE%PRINT/LOAD_BALANCE_INFO")
!$OMP END MASTER
      END IF

      IF (use_virial) THEN
         DO i = 1, 3
            DO j = 1, 3
               DO k = 1, 3
!$OMP                 ATOMIC
                  virial%pv_fock_4c(i, j) = virial%pv_fock_4c(i, j) + tmp_virial(i, k)*cell%hmat(j, k)
               END DO
            END DO
         END DO
      END IF

!$OMP BARRIER
      !! Get some number about ERIS
!$OMP ATOMIC
      shm_neris_total = shm_neris_total + neris_total
!$OMP ATOMIC
      shm_neris_onthefly = shm_neris_onthefly + neris_onthefly
!$OMP ATOMIC
      shm_nprim_ints = shm_nprim_ints + nprim_ints

      storage_counter_integrals = memory_parameter%actual_memory_usage* &
                                  memory_parameter%cache_size
      stor_count_max_val = max_val_memory*memory_parameter%cache_size
!$OMP ATOMIC
      shm_storage_counter_integrals = shm_storage_counter_integrals + storage_counter_integrals
!$OMP ATOMIC
      shm_neris_incore = shm_neris_incore + neris_incore
!$OMP ATOMIC
      shm_stor_count_max_val = shm_stor_count_max_val + stor_count_max_val
!$OMP BARRIER

      ! IF(with_resp_density) THEN
      !   ! restore original load_balance_parameter
      !   actual_x_data%load_balance_parameter = load_balance_parameter_energy
      !   DEALLOCATE(load_balance_parameter_energy)
      ! END IF

!$OMP MASTER
      !! Print some memeory information if this is the first step
      IF (actual_x_data%memory_parameter%recalc_forces) THEN
         tmp_i8(1:6) = (/shm_storage_counter_integrals, shm_neris_onthefly, shm_neris_incore, &
                         shm_neris_total, shm_nprim_ints, shm_stor_count_max_val/)
         CALL para_env%sum(tmp_i8)
         shm_storage_counter_integrals = tmp_i8(1)
         shm_neris_onthefly = tmp_i8(2)
         shm_neris_incore = tmp_i8(3)
         shm_neris_total = tmp_i8(4)
         shm_nprim_ints = tmp_i8(5)
         shm_stor_count_max_val = tmp_i8(6)
         mem_eris = (shm_storage_counter_integrals + 128*1024 - 1)/1024/128
         compression_factor = REAL(shm_neris_incore, dp)/REAL(shm_storage_counter_integrals, dp)
         mem_max_val = (shm_stor_count_max_val + 128*1024 - 1)/1024/128

         IF (shm_neris_incore == 0) THEN
            mem_eris = 0
            compression_factor = 0.0_dp
         END IF

         logger => cp_get_default_logger()

         iw = cp_print_key_unit_nr(logger, hfx_section, "HF_INFO", &
                                   extension=".scfLog")
         IF (iw > 0) THEN
            WRITE (UNIT=iw, FMT="(/,(T3,A,T65,I16))") &
               "HFX_MEM_INFO| Number of cart. primitive DERIV's calculated:      ", shm_nprim_ints

            WRITE (UNIT=iw, FMT="((T3,A,T60,I21))") &
               "HFX_MEM_INFO| Number of sph. DERIV's calculated:           ", shm_neris_total

            WRITE (UNIT=iw, FMT="((T3,A,T60,I21))") &
               "HFX_MEM_INFO| Number of sph. DERIV's stored in-core:        ", shm_neris_incore

            WRITE (UNIT=iw, FMT="((T3,A,T62,I19))") &
               "HFX_MEM_INFO| Number of sph. DERIV's calculated on the fly: ", shm_neris_onthefly

            WRITE (UNIT=iw, FMT="((T3,A,T60,I21))") &
               "HFX_MEM_INFO| Total memory consumption DERIV's RAM [MiB]:            ", mem_eris

            WRITE (UNIT=iw, FMT="((T3,A,T60,I21))") &
               "HFX_MEM_INFO| Whereof max-vals [MiB]:            ", mem_max_val

            WRITE (UNIT=iw, FMT="((T3,A,T60,F21.2),/)") &
               "HFX_MEM_INFO| Total compression factor DERIV's RAM:                  ", compression_factor
         END IF

         CALL cp_print_key_finished_output(iw, logger, hfx_section, &
                                           "HF_INFO")
      END IF
!$OMP END MASTER

      !! flush caches if the geometry changed
      IF (do_dynamic_load_balancing) THEN
         my_bin_size = SIZE(actual_x_data%distribution_forces)
      ELSE
         my_bin_size = 1
      END IF

      IF (actual_x_data%memory_parameter%recalc_forces) THEN
         IF (treat_forces_in_core) THEN
            DO bin = 1, my_bin_size
               maxval_cache => actual_x_data%store_forces%maxval_cache(bin)
               maxval_container => actual_x_data%store_forces%maxval_container(bin)
               integral_caches => actual_x_data%store_forces%integral_caches(:, bin)
               integral_containers => actual_x_data%store_forces%integral_containers(:, bin)
               CALL hfx_flush_last_cache(bits_max_val, maxval_cache, maxval_container, memory_parameter%actual_memory_usage, &
                                         .FALSE.)
               DO i = 1, 64
                  CALL hfx_flush_last_cache(i, integral_caches(i), integral_containers(i), &
                                            memory_parameter%actual_memory_usage, .FALSE.)
               END DO
            END DO
         END IF
      END IF
      !! reset all caches except we calculate all on the fly
      IF (treat_forces_in_core) THEN
         DO bin = 1, my_bin_size
            maxval_cache => actual_x_data%store_forces%maxval_cache(bin)
            maxval_container => actual_x_data%store_forces%maxval_container(bin)
            integral_caches => actual_x_data%store_forces%integral_caches(:, bin)
            integral_containers => actual_x_data%store_forces%integral_containers(:, bin)

            CALL hfx_reset_cache_and_container(maxval_cache, maxval_container, memory_parameter%actual_memory_usage, .FALSE.)
            DO i = 1, 64
               CALL hfx_reset_cache_and_container(integral_caches(i), integral_containers(i), &
                                                  memory_parameter%actual_memory_usage, &
                                                  .FALSE.)
            END DO
         END DO
      END IF

      IF (actual_x_data%memory_parameter%recalc_forces) THEN
         actual_x_data%memory_parameter%recalc_forces = .FALSE.
      END IF

!$OMP BARRIER
!$OMP MASTER
      CALL timestop(handle_main)
!$OMP END MASTER
!$OMP BARRIER
      DEALLOCATE (last_sgf_global)
!$OMP MASTER
      DEALLOCATE (full_density_alpha)
      IF (with_resp_density) THEN
         DEALLOCATE (full_density_resp)
      END IF
      IF (nspins == 2) THEN
         DEALLOCATE (full_density_beta)
         IF (with_resp_density) THEN
            DEALLOCATE (full_density_resp_beta)
         END IF
      END IF
      IF (do_dynamic_load_balancing) THEN
         DEALLOCATE (actual_x_data%task_list)
      END IF
!$OMP END MASTER
      DEALLOCATE (primitive_forces)
      DEALLOCATE (atom_of_kind, kind_of)
      DEALLOCATE (max_contraction)
      DEALLOCATE (pbd_buf)
      DEALLOCATE (pbc_buf)
      DEALLOCATE (pad_buf)
      DEALLOCATE (pac_buf)

      IF (with_resp_density) THEN
         DEALLOCATE (pbd_buf_resp)
         DEALLOCATE (pbc_buf_resp)
         DEALLOCATE (pad_buf_resp)
         DEALLOCATE (pac_buf_resp)
      END IF

      DO i = 1, max_pgf**2
         DEALLOCATE (pgf_list_ij(i)%image_list)
         DEALLOCATE (pgf_list_kl(i)%image_list)
      END DO

      DEALLOCATE (pgf_list_ij)
      DEALLOCATE (pgf_list_kl)
      DEALLOCATE (pgf_product_list)

      DEALLOCATE (set_list_ij, set_list_kl)

      DEALLOCATE (primitive_forces_virial)

      DEALLOCATE (ede_work, ede_work2, ede_work_forces, ede_buffer1, ede_buffer2, ede_primitives_tmp)

      DEALLOCATE (ede_work_virial, ede_work2_virial, ede_primitives_tmp_virial)

      DEALLOCATE (nimages)

      IF (nspins == 2) THEN
         DEALLOCATE (pbd_buf_beta, pbc_buf_beta, pad_buf_beta, pac_buf_beta)
         IF (with_resp_density) THEN
            DEALLOCATE (pbd_buf_resp_beta)
            DEALLOCATE (pbc_buf_resp_beta)
            DEALLOCATE (pad_buf_resp_beta)
            DEALLOCATE (pac_buf_resp_beta)
         END IF
      END IF

      !
      ! this wraps to free_libint, but currently causes a segfault
      ! as a result, we don't call it, but some memory remains allocated
      !
      ! CALL terminate_libderiv(private_deriv)

!$OMP END PARALLEL

      CALL timestop(handle)

   END SUBROUTINE derivatives_four_center

! **************************************************************************************************
!> \brief ...
!> \param deriv ...
!> \param ra ...
!> \param rb ...
!> \param rc ...
!> \param rd ...
!> \param npgfa ...
!> \param npgfb ...
!> \param npgfc ...
!> \param npgfd ...
!> \param la_min ...
!> \param la_max ...
!> \param lb_min ...
!> \param lb_max ...
!> \param lc_min ...
!> \param lc_max ...
!> \param ld_min ...
!> \param ld_max ...
!> \param nsgfa ...
!> \param nsgfb ...
!> \param nsgfc ...
!> \param nsgfd ...
!> \param sphi_a_u1 ...
!> \param sphi_a_u2 ...
!> \param sphi_a_u3 ...
!> \param sphi_b_u1 ...
!> \param sphi_b_u2 ...
!> \param sphi_b_u3 ...
!> \param sphi_c_u1 ...
!> \param sphi_c_u2 ...
!> \param sphi_c_u3 ...
!> \param sphi_d_u1 ...
!> \param sphi_d_u2 ...
!> \param sphi_d_u3 ...
!> \param zeta ...
!> \param zetb ...
!> \param zetc ...
!> \param zetd ...
!> \param primitive_integrals ...
!> \param potential_parameter ...
!> \param neighbor_cells ...
!> \param screen1 ...
!> \param screen2 ...
!> \param eps_schwarz ...
!> \param max_contraction_val ...
!> \param cart_estimate ...
!> \param cell ...
!> \param neris_tmp ...
!> \param log10_pmax ...
!> \param log10_eps_schwarz ...
!> \param R1_pgf ...
!> \param R2_pgf ...
!> \param pgf1 ...
!> \param pgf2 ...
!> \param pgf_list_ij ...
!> \param pgf_list_kl ...
!> \param pgf_product_list ...
!> \param nsgfl_a ...
!> \param nsgfl_b ...
!> \param nsgfl_c ...
!> \param nsgfl_d ...
!> \param sphi_a_ext ...
!> \param sphi_b_ext ...
!> \param sphi_c_ext ...
!> \param sphi_d_ext ...
!> \param ede_work ...
!> \param ede_work2 ...
!> \param ede_work_forces ...
!> \param ede_buffer1 ...
!> \param ede_buffer2 ...
!> \param ede_primitives_tmp ...
!> \param nimages ...
!> \param do_periodic ...
!> \param use_virial ...
!> \param ede_work_virial ...
!> \param ede_work2_virial ...
!> \param ede_primitives_tmp_virial ...
!> \param primitive_integrals_virial ...
!> \param cart_estimate_virial ...
! **************************************************************************************************
   SUBROUTINE forces4(deriv, ra, rb, rc, rd, npgfa, npgfb, npgfc, npgfd, &
                      la_min, la_max, lb_min, lb_max, &
                      lc_min, lc_max, ld_min, ld_max, nsgfa, nsgfb, nsgfc, nsgfd, &
                      sphi_a_u1, sphi_a_u2, sphi_a_u3, &
                      sphi_b_u1, sphi_b_u2, sphi_b_u3, &
                      sphi_c_u1, sphi_c_u2, sphi_c_u3, &
                      sphi_d_u1, sphi_d_u2, sphi_d_u3, &
                      zeta, zetb, zetc, zetd, &
                      primitive_integrals, &
                      potential_parameter, neighbor_cells, &
                      screen1, screen2, eps_schwarz, max_contraction_val, &
                      cart_estimate, cell, neris_tmp, log10_pmax, &
                      log10_eps_schwarz, R1_pgf, R2_pgf, pgf1, pgf2, &
                      pgf_list_ij, pgf_list_kl, &
                      pgf_product_list, &
                      nsgfl_a, nsgfl_b, nsgfl_c, &
                      nsgfl_d, &
                      sphi_a_ext, sphi_b_ext, sphi_c_ext, sphi_d_ext, &
                      ede_work, ede_work2, ede_work_forces, &
                      ede_buffer1, ede_buffer2, ede_primitives_tmp, &
                      nimages, do_periodic, use_virial, ede_work_virial, &
                      ede_work2_virial, ede_primitives_tmp_virial, primitive_integrals_virial, &
                      cart_estimate_virial)

      TYPE(cp_libint_t)                                  :: deriv
      REAL(dp), INTENT(IN)                               :: ra(3), rb(3), rc(3), rd(3)
      INTEGER, INTENT(IN) :: npgfa, npgfb, npgfc, npgfd, la_min, la_max, lb_min, lb_max, lc_min, &
                             lc_max, ld_min, ld_max, nsgfa, nsgfb, nsgfc, nsgfd, sphi_a_u1, sphi_a_u2, sphi_a_u3, &
                             sphi_b_u1, sphi_b_u2, sphi_b_u3, sphi_c_u1, sphi_c_u2, sphi_c_u3, sphi_d_u1, sphi_d_u2, &
                             sphi_d_u3
      REAL(dp), DIMENSION(1:npgfa), INTENT(IN)           :: zeta
      REAL(dp), DIMENSION(1:npgfb), INTENT(IN)           :: zetb
      REAL(dp), DIMENSION(1:npgfc), INTENT(IN)           :: zetc
      REAL(dp), DIMENSION(1:npgfd), INTENT(IN)           :: zetd
      REAL(dp), &
         DIMENSION(nsgfa, nsgfb, nsgfc, nsgfd, 12)       :: primitive_integrals
      TYPE(hfx_potential_type)                           :: potential_parameter
      TYPE(hfx_cell_type), DIMENSION(:), POINTER         :: neighbor_cells
      REAL(dp), INTENT(IN)                               :: screen1(2), screen2(2), eps_schwarz, &
                                                            max_contraction_val
      REAL(dp)                                           :: cart_estimate
      TYPE(cell_type), POINTER                           :: cell
      INTEGER(int_8)                                     :: neris_tmp
      REAL(dp), INTENT(IN)                               :: log10_pmax, log10_eps_schwarz
      TYPE(hfx_screen_coeff_type), DIMENSION(:, :), &
         POINTER                                         :: R1_pgf, R2_pgf, pgf1, pgf2
      TYPE(hfx_pgf_list), DIMENSION(*)                   :: pgf_list_ij, pgf_list_kl
      TYPE(hfx_pgf_product_list), ALLOCATABLE, &
         DIMENSION(:), INTENT(INOUT)                     :: pgf_product_list
      INTEGER, DIMENSION(0:), INTENT(IN)                 :: nsgfl_a, nsgfl_b, nsgfl_c, nsgfl_d
      REAL(dp), INTENT(IN) :: sphi_a_ext(sphi_a_u1, sphi_a_u2, sphi_a_u3), &
                              sphi_b_ext(sphi_b_u1, sphi_b_u2, sphi_b_u3), sphi_c_ext(sphi_c_u1, sphi_c_u2, sphi_c_u3), &
                              sphi_d_ext(sphi_d_u1, sphi_d_u2, sphi_d_u3)
      REAL(dp), DIMENSION(*)                             :: ede_work, ede_work2, ede_work_forces, &
                                                            ede_buffer1, ede_buffer2, &
                                                            ede_primitives_tmp
      INTEGER, DIMENSION(*)                              :: nimages
      LOGICAL, INTENT(IN)                                :: do_periodic, use_virial
      REAL(dp), DIMENSION(*)                             :: ede_work_virial, ede_work2_virial, &
                                                            ede_primitives_tmp_virial
      REAL(dp), &
         DIMENSION(nsgfa, nsgfb, nsgfc, nsgfd, 12, 3)    :: primitive_integrals_virial
      REAL(dp)                                           :: cart_estimate_virial

      INTEGER :: ipgf, jpgf, kpgf, la, lb, lc, ld, list_ij, list_kl, lpgf, max_l, ncoa, ncob, &
                 ncoc, ncod, nelements_ij, nelements_kl, nproducts, nsgfla, nsgflb, nsgflc, nsgfld, nsoa, &
                 nsob, nsoc, nsod, s_offset_a, s_offset_b, s_offset_c, s_offset_d
      REAL(dp)                                           :: EtaInv, tmp_max, tmp_max_virial, Zeta_A, &
                                                            Zeta_B, Zeta_C, Zeta_D, ZetaInv

      CALL build_pair_list_pgf(npgfa, npgfb, pgf_list_ij, zeta, zetb, screen1, screen2, &
                               pgf1, R1_pgf, log10_pmax, log10_eps_schwarz, ra, rb, &
                               nelements_ij, &
                               neighbor_cells, nimages, do_periodic)
      CALL build_pair_list_pgf(npgfc, npgfd, pgf_list_kl, zetc, zetd, screen2, screen1, &
                               pgf2, R2_pgf, log10_pmax, log10_eps_schwarz, rc, rd, &
                               nelements_kl, &
                               neighbor_cells, nimages, do_periodic)

      cart_estimate = 0.0_dp
      primitive_integrals = 0.0_dp
      IF (use_virial) THEN
         primitive_integrals_virial = 0.0_dp
         cart_estimate_virial = 0.0_dp
      END IF
      neris_tmp = 0_int_8
      max_l = la_max + lb_max + lc_max + ld_max + 1
      DO list_ij = 1, nelements_ij
         Zeta_A = pgf_list_ij(list_ij)%zeta
         Zeta_B = pgf_list_ij(list_ij)%zetb
         ZetaInv = pgf_list_ij(list_ij)%ZetaInv
         ipgf = pgf_list_ij(list_ij)%ipgf
         jpgf = pgf_list_ij(list_ij)%jpgf

         DO list_kl = 1, nelements_kl
            Zeta_C = pgf_list_kl(list_kl)%zeta
            Zeta_D = pgf_list_kl(list_kl)%zetb
            EtaInv = pgf_list_kl(list_kl)%ZetaInv
            kpgf = pgf_list_kl(list_kl)%ipgf
            lpgf = pgf_list_kl(list_kl)%jpgf

            CALL build_pgf_product_list(pgf_list_ij(list_ij), pgf_list_kl(list_kl), pgf_product_list, &
                                        nproducts, log10_pmax, log10_eps_schwarz, neighbor_cells, cell, &
                                        potential_parameter, max_l, do_periodic)
            s_offset_a = 0
            DO la = la_min, la_max
               s_offset_b = 0
               ncoa = nco(la)
               nsgfla = nsgfl_a(la)
               nsoa = nso(la)

               DO lb = lb_min, lb_max
                  s_offset_c = 0
                  ncob = nco(lb)
                  nsgflb = nsgfl_b(lb)
                  nsob = nso(lb)

                  DO lc = lc_min, lc_max
                     s_offset_d = 0
                     ncoc = nco(lc)
                     nsgflc = nsgfl_c(lc)
                     nsoc = nso(lc)

                     DO ld = ld_min, ld_max
                        ncod = nco(ld)
                        nsgfld = nsgfl_d(ld)
                        nsod = nso(ld)
                        tmp_max = 0.0_dp
                        tmp_max_virial = 0.0_dp
                        CALL evaluate_deriv_eri(deriv, nproducts, pgf_product_list, &
                                                la, lb, lc, ld, &
                                                ncoa, ncob, ncoc, ncod, &
                                                nsgfa, nsgfb, nsgfc, nsgfd, &
                                                primitive_integrals, &
                                                max_contraction_val, tmp_max, eps_schwarz, neris_tmp, &
                                                Zeta_A, Zeta_B, Zeta_C, Zeta_D, ZetaInv, EtaInv, &
                                                s_offset_a, s_offset_b, s_offset_c, s_offset_d, &
                                                nsgfla, nsgflb, nsgflc, nsgfld, nsoa, nsob, nsoc, nsod, &
                                                sphi_a_ext(1, la + 1, ipgf), &
                                                sphi_b_ext(1, lb + 1, jpgf), &
                                                sphi_c_ext(1, lc + 1, kpgf), &
                                                sphi_d_ext(1, ld + 1, lpgf), &
                                                ede_work, ede_work2, ede_work_forces, &
                                                ede_buffer1, ede_buffer2, ede_primitives_tmp, use_virial, &
                                                ede_work_virial, ede_work2_virial, ede_primitives_tmp_virial, &
                                                primitive_integrals_virial, cell, tmp_max_virial)
                        cart_estimate = MAX(tmp_max, cart_estimate)
                        IF (use_virial) cart_estimate_virial = MAX(tmp_max_virial, cart_estimate_virial)
                        s_offset_d = s_offset_d + nsod*nsgfld
                     END DO !ld
                     s_offset_c = s_offset_c + nsoc*nsgflc
                  END DO !lc
                  s_offset_b = s_offset_b + nsob*nsgflb
               END DO !lb
               s_offset_a = s_offset_a + nsoa*nsgfla
            END DO !la
         END DO
      END DO

   END SUBROUTINE forces4

! **************************************************************************************************
!> \brief Given a 2d index pair, this function returns a 1d index pair for
!>        a symmetric upper triangle NxN matrix
!>        The compiler should inline this function, therefore it appears in
!>        several modules
!> \param i 2d index
!> \param j 2d index
!> \param N matrix size
!> \return ...
!> \par History
!>      03.2009 created [Manuel Guidon]
!> \author Manuel Guidon
! **************************************************************************************************
   PURE FUNCTION get_1D_idx(i, j, N)
      INTEGER, INTENT(IN)                                :: i, j
      INTEGER(int_8), INTENT(IN)                         :: N
      INTEGER(int_8)                                     :: get_1D_idx

      INTEGER(int_8)                                     :: min_ij

      min_ij = MIN(i, j)
      get_1D_idx = min_ij*N + MAX(i, j) - (min_ij - 1)*min_ij/2 - N

   END FUNCTION get_1D_idx

! **************************************************************************************************
!> \brief This routine prefetches density matrix elements, i.e. reshuffles the
!>        data in a way that they can be accessed later on in a cache friendly
!>        way
!> \param ma_max Size of matrix blocks
!> \param mb_max Size of matrix blocks
!> \param mc_max Size of matrix blocks
!> \param md_max Size of matrix blocks
!> \param density ...
!> \param pbd buffer that will contain P(b,d)
!> \param pbc buffer that will contain P(b,c)
!> \param pad buffer that will contain P(a,d)
!> \param pac buffer that will contain P(a,c)
!> \param iatom ...
!> \param jatom ...
!> \param katom ...
!> \param latom ...
!> \param iset ...
!> \param jset ...
!> \param kset ...
!> \param lset ...
!> \param offset_bd_set ...
!> \param offset_bc_set ...
!> \param offset_ad_set ...
!> \param offset_ac_set ...
!> \param atomic_offset_bd ...
!> \param atomic_offset_bc ...
!> \param atomic_offset_ad ...
!> \param atomic_offset_ac ...
!> \param anti_symmetric ...
!> \par History
!>      03.2009 created [Manuel Guidon]
!> \author Manuel Guidon
! **************************************************************************************************
   SUBROUTINE prefetch_density_matrix(ma_max, mb_max, mc_max, md_max, &
                                      density, pbd, pbc, pad, pac, &
                                      iatom, jatom, katom, latom, &
                                      iset, jset, kset, lset, offset_bd_set, offset_bc_set, &
                                      offset_ad_set, offset_ac_set, atomic_offset_bd, atomic_offset_bc, &
                                      atomic_offset_ad, atomic_offset_ac, anti_symmetric)

      INTEGER, INTENT(IN)                                :: ma_max, mb_max, mc_max, md_max
      REAL(dp), DIMENSION(:), INTENT(IN)                 :: density
      REAL(dp), DIMENSION(*), INTENT(INOUT)              :: pbd, pbc, pad, pac
      INTEGER, INTENT(IN)                                :: iatom, jatom, katom, latom, iset, jset, &
                                                            kset, lset
      INTEGER, DIMENSION(:, :), POINTER                  :: offset_bd_set, offset_bc_set, &
                                                            offset_ad_set, offset_ac_set
      INTEGER, INTENT(IN)                                :: atomic_offset_bd, atomic_offset_bc, &
                                                            atomic_offset_ad, atomic_offset_ac
      LOGICAL                                            :: anti_symmetric

      INTEGER                                            :: i, j, ma, mb, mc, md, offset_ac, &
                                                            offset_ad, offset_bc, offset_bd
      REAL(dp)                                           :: fac

      fac = 1.0_dp
      IF (anti_symmetric) fac = -1.0_dp
      IF (jatom >= latom) THEN
         i = 1
         offset_bd = offset_bd_set(jset, lset) + atomic_offset_bd - 1
         j = offset_bd
         DO md = 1, md_max
            DO mb = 1, mb_max
               pbd(i) = density(j)*fac
               i = i + 1
               j = j + 1
            END DO
         END DO
      ELSE
         i = 1
         offset_bd = offset_bd_set(lset, jset) + atomic_offset_bd - 1
         DO md = 1, md_max
            j = offset_bd + md - 1
            DO mb = 1, mb_max
               pbd(i) = density(j)
               i = i + 1
               j = j + md_max
            END DO
         END DO
      END IF
      IF (jatom >= katom) THEN
         i = 1
         offset_bc = offset_bc_set(jset, kset) + atomic_offset_bc - 1
         j = offset_bc
         DO mc = 1, mc_max
            DO mb = 1, mb_max
               pbc(i) = density(j)*fac
               i = i + 1
               j = j + 1
            END DO
         END DO
      ELSE
         i = 1
         offset_bc = offset_bc_set(kset, jset) + atomic_offset_bc - 1
         DO mc = 1, mc_max
            j = offset_bc + mc - 1
            DO mb = 1, mb_max
               pbc(i) = density(j)
               i = i + 1
               j = j + mc_max
            END DO
         END DO
      END IF
      IF (iatom >= latom) THEN
         i = 1
         offset_ad = offset_ad_set(iset, lset) + atomic_offset_ad - 1
         j = offset_ad
         DO md = 1, md_max
            DO ma = 1, ma_max
               pad(i) = density(j)*fac
               i = i + 1
               j = j + 1
            END DO
         END DO
      ELSE
         i = 1
         offset_ad = offset_ad_set(lset, iset) + atomic_offset_ad - 1
         DO md = 1, md_max
            j = offset_ad + md - 1
            DO ma = 1, ma_max
               pad(i) = density(j)
               i = i + 1
               j = j + md_max
            END DO
         END DO
      END IF
      IF (iatom >= katom) THEN
         i = 1
         offset_ac = offset_ac_set(iset, kset) + atomic_offset_ac - 1
         j = offset_ac
         DO mc = 1, mc_max
            DO ma = 1, ma_max
               pac(i) = density(j)*fac
               i = i + 1
               j = j + 1
            END DO
         END DO
      ELSE
         i = 1
         offset_ac = offset_ac_set(kset, iset) + atomic_offset_ac - 1
         DO mc = 1, mc_max
            j = offset_ac + mc - 1
            DO ma = 1, ma_max
               pac(i) = density(j)
               i = i + 1
               j = j + mc_max
            END DO
         END DO
      END IF
   END SUBROUTINE prefetch_density_matrix

! **************************************************************************************************
!> \brief This routine updates the forces using buffered density matrices
!> \param ma_max Size of matrix blocks
!> \param mb_max Size of matrix blocks
!> \param mc_max Size of matrix blocks
!> \param md_max Size of matrix blocks
!> \param pbd buffer that will contain P(b,d)
!> \param pbc buffer that will contain P(b,c)
!> \param pad buffer that will contain P(a,d)
!> \param pac buffer that will contain P(a,c)
!> \param fac mulitplication factor (spin, symmetry)
!> \param prim primitive forces
!> \param force storage loacation for forces
!> \param forces_map index table
!> \param coord which of the 12 coords to be updated
!> \param pbd_resp ...
!> \param pbc_resp ...
!> \param pad_resp ...
!> \param pac_resp ...
!> \param resp_only ...
!> \par History
!>      03.2009 created [Manuel Guidon]
!> \author Manuel Guidon
! **************************************************************************************************
   SUBROUTINE update_forces(ma_max, mb_max, mc_max, md_max, &
                            pbd, pbc, pad, pac, fac, &
                            prim, force, forces_map, coord, &
                            pbd_resp, pbc_resp, pad_resp, pac_resp, resp_only)

      INTEGER, INTENT(IN)                                :: ma_max, mb_max, mc_max, md_max
      REAL(dp), DIMENSION(*), INTENT(IN)                 :: pbd, pbc, pad, pac
      REAL(dp), INTENT(IN)                               :: fac
      REAL(dp), DIMENSION(ma_max*mb_max*mc_max*md_max), &
         INTENT(IN)                                      :: prim
      TYPE(qs_force_type), DIMENSION(:), POINTER         :: force
      INTEGER, INTENT(IN)                                :: forces_map(4, 2), coord
      REAL(dp), DIMENSION(*), INTENT(IN), OPTIONAL       :: pbd_resp, pbc_resp, pad_resp, pac_resp
      LOGICAL, INTENT(IN), OPTIONAL                      :: resp_only

      INTEGER                                            :: ma, mb, mc, md, p_index
      LOGICAL                                            :: full_force, with_resp_density
      REAL(dp)                                           :: temp1, temp1_resp, temp3, temp3_resp, &
                                                            temp4, teresp

      with_resp_density = .FALSE.
      IF (PRESENT(pbd_resp) .AND. &
          PRESENT(pbc_resp) .AND. &
          PRESENT(pad_resp) .AND. &
          PRESENT(pac_resp)) with_resp_density = .TRUE.

      IF (with_resp_density) THEN
         full_force = .TRUE.
         IF (PRESENT(resp_only)) full_force = .NOT. resp_only
         p_index = 0
         temp4 = 0.0_dp
         DO md = 1, md_max
            DO mc = 1, mc_max
               DO mb = 1, mb_max
                  temp1 = pbc((mc - 1)*mb_max + mb)*fac
                  temp3 = pbd((md - 1)*mb_max + mb)*fac
                  temp1_resp = pbc_resp((mc - 1)*mb_max + mb)*fac
                  temp3_resp = pbd_resp((md - 1)*mb_max + mb)*fac
                  DO ma = 1, ma_max
                     p_index = p_index + 1
                     ! HF-SCF
                     IF (full_force) THEN
                        teresp = temp1*pad((md - 1)*ma_max + ma) + &
                                 temp3*pac((mc - 1)*ma_max + ma)
                     ELSE
                        teresp = 0.0_dp
                     END IF
                     ! RESP+HF
                     teresp = teresp + &
                              pac((mc - 1)*ma_max + ma)*temp3_resp + &
                              pac_resp((mc - 1)*ma_max + ma)*temp3 + &
                              pad((md - 1)*ma_max + ma)*temp1_resp + &
                              pad_resp((md - 1)*ma_max + ma)*temp1
                     temp4 = temp4 + teresp*prim(p_index)
                  END DO !ma
               END DO !mb
            END DO !mc
         END DO !md
      ELSE
         p_index = 0
         temp4 = 0.0_dp
         DO md = 1, md_max
            DO mc = 1, mc_max
               DO mb = 1, mb_max
                  temp1 = pbc((mc - 1)*mb_max + mb)*fac
                  temp3 = pbd((md - 1)*mb_max + mb)*fac
                  DO ma = 1, ma_max
                     p_index = p_index + 1
                     teresp = temp1*pad((md - 1)*ma_max + ma) + &
                              temp3*pac((mc - 1)*ma_max + ma)
                     temp4 = temp4 + teresp*prim(p_index)
                  END DO !ma
               END DO !mb
            END DO !mc
         END DO !md
      END IF

!$OMP ATOMIC
      force(forces_map((coord - 1)/3 + 1, 1))%fock_4c(MOD(coord - 1, 3) + 1, &
                                                      forces_map((coord - 1)/3 + 1, 2)) = &
         force(forces_map((coord - 1)/3 + 1, 1))%fock_4c(MOD(coord - 1, 3) + 1, &
                                                         forces_map((coord - 1)/3 + 1, 2)) - &
         temp4

   END SUBROUTINE update_forces

! **************************************************************************************************
!> \brief This routine updates the virial using buffered density matrices
!> \param ma_max Size of matrix blocks
!> \param mb_max Size of matrix blocks
!> \param mc_max Size of matrix blocks
!> \param md_max Size of matrix blocks
!> \param pbd buffer that will contain P(b,d)
!> \param pbc buffer that will contain P(b,c)
!> \param pad buffer that will contain P(a,d)
!> \param pac buffer that will contain P(a,c)
!> \param fac mulitplication factor (spin, symmetry)
!> \param prim primitive forces
!> \param tmp_virial ...
!> \param coord which of the 12 coords to be updated
!> \param l ...
!> \param pbd_resp ...
!> \param pbc_resp ...
!> \param pad_resp ...
!> \param pac_resp ...
!> \par History
!>      03.2009 created [Manuel Guidon]
!> \author Manuel Guidon
! **************************************************************************************************
   SUBROUTINE update_virial(ma_max, mb_max, mc_max, md_max, &
                            pbd, pbc, pad, pac, fac, &
                            prim, tmp_virial, coord, l, &
                            pbd_resp, pbc_resp, pad_resp, pac_resp, resp_only)

      INTEGER, INTENT(IN)                                :: ma_max, mb_max, mc_max, md_max
      REAL(dp), DIMENSION(*), INTENT(IN)                 :: pbd, pbc, pad, pac
      REAL(dp), INTENT(IN)                               :: fac
      REAL(dp), DIMENSION(ma_max*mb_max*mc_max*md_max), &
         INTENT(IN)                                      :: prim
      REAL(dp)                                           :: tmp_virial(3, 3)
      INTEGER, INTENT(IN)                                :: coord, l
      REAL(dp), DIMENSION(*), INTENT(IN), OPTIONAL       :: pbd_resp, pbc_resp, pad_resp, pac_resp
      LOGICAL, INTENT(IN), OPTIONAL                      :: resp_only

      INTEGER                                            :: i, j, ma, mb, mc, md, p_index
      LOGICAL                                            :: with_resp_density, full_force
      REAL(dp)                                           :: temp1, temp1_resp, temp3, temp3_resp, &
                                                            temp4, teresp

      with_resp_density = .FALSE.
      IF (PRESENT(pbd_resp) .AND. &
          PRESENT(pbc_resp) .AND. &
          PRESENT(pad_resp) .AND. &
          PRESENT(pac_resp)) with_resp_density = .TRUE.

      IF (with_resp_density) THEN
         full_force = .TRUE.
         IF (PRESENT(resp_only)) full_force = .NOT. resp_only
         p_index = 0
         temp4 = 0.0_dp
         DO md = 1, md_max
            DO mc = 1, mc_max
               DO mb = 1, mb_max
                  temp1 = pbc((mc - 1)*mb_max + mb)*fac
                  temp3 = pbd((md - 1)*mb_max + mb)*fac
                  temp1_resp = pbc_resp((mc - 1)*mb_max + mb)*fac
                  temp3_resp = pbd_resp((md - 1)*mb_max + mb)*fac
                  DO ma = 1, ma_max
                     p_index = p_index + 1
                     ! HF-SCF
                     IF (full_force) THEN
                        teresp = temp1*pad((md - 1)*ma_max + ma) + &
                                 temp3*pac((mc - 1)*ma_max + ma)
                     ELSE
                        teresp = 0.0_dp
                     END IF
                     ! RESP+HF
                     teresp = teresp + &
                              pac((mc - 1)*ma_max + ma)*temp3_resp + &
                              pac_resp((mc - 1)*ma_max + ma)*temp3 + &
                              pad((md - 1)*ma_max + ma)*temp1_resp + &
                              pad_resp((md - 1)*ma_max + ma)*temp1
                     temp4 = temp4 + teresp*prim(p_index)
                  END DO !ma
               END DO !mb
            END DO !mc
         END DO !md
      ELSE
         p_index = 0
         temp4 = 0.0_dp
         DO md = 1, md_max
            DO mc = 1, mc_max
               DO mb = 1, mb_max
                  temp1 = pbc((mc - 1)*mb_max + mb)*fac
                  temp3 = pbd((md - 1)*mb_max + mb)*fac
                  DO ma = 1, ma_max
                     p_index = p_index + 1
                     teresp = temp1*pad((md - 1)*ma_max + ma) + &
                              temp3*pac((mc - 1)*ma_max + ma)
                     temp4 = temp4 + teresp*prim(p_index)
                  END DO !ma
               END DO !mb
            END DO !mc
         END DO !md
      END IF

      j = l
      i = MOD(coord - 1, 3) + 1
      tmp_virial(i, j) = tmp_virial(i, j) - temp4
   END SUBROUTINE update_virial

   #:include 'hfx_get_pmax_val.fypp'

END MODULE hfx_derivatives
